本篇是python实例编写：糗事百科，目的是介绍最基础的scrapy框架的写法和运行过程。

功能描述

目标：获取糗事百科所有信息的作者，url和内容，需要爬取多个页面，/page/1~/page/13。

输出：保存到文件中（json格式）

编写步骤

编写spider处理链接爬取和页面解析，编写pipelines处理信息存储

步骤1：建立工程和Spider模板

1
2
3

\>scrapy startproject qsbk
\>cd qsbk
\>scrapy genspider qsbk_spider qiushibaike.com

步骤2：编写Spider

import scrapy
from urllib.parse import urljoin
from qsbk.items import QsbkItem


class QsbkSpiderSpider(scrapy.Spider):
    """Spider that extracts the jokes listed on the first text page.

    ``parse`` yields one ``QsbkItem`` per entry ``div`` of the listing. Each
    item carries the author name and the joke text; the detail-page URL is
    stored as well when the item class declares a ``url`` field.
    """

    name = 'qsbk_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['http://qiushibaike.com/text/page/1/']

    def parse(self, response):
        """Yield a ``QsbkItem`` for every entry ``div`` found in *response*."""
        # response.xpath() returns a SelectorList; iterating it gives one
        # Selector per entry div.
        duanzi_divs = response.xpath("//div[@class='col1 old-style-col1']/div")

        for i, div in enumerate(duanzi_divs, start=1):
            # get() returns the first match as a str (same as extract_first());
            # extract() returns all matches as a list of str.
            # default='' keeps .strip() from failing on a div without <h2>.
            author = div.xpath(".//h2/text()").get(default='').strip()

            # href attribute of the <a> tag that links to the detail page.
            href = div.xpath('./a/@href').get()
            # Resolve against the page actually fetched, not a fixed start URL.
            url = response.urljoin(href) if href else None

            # The text lives in a <span> below the content div, so //text()
            # collects every text node underneath it.
            content = div.xpath(".//div[@class='content']//text()").extract()
            content = ''.join(content).strip()

            # Debug output; uncomment (and comment out the yield below) to
            # test the extraction on its own:
            # tplt = "{0:{2}<20}\t{1:^50}"
            # print(tplt.format(author, url, chr(12288)))
            # print(content)
            # print('=' * 100)

            # A plain dict {'author': author, 'content': content} would also
            # work as an item; the declared Item class is used instead.
            item = QsbkItem(author=author, content=content)
            # Assigning an undeclared field raises KeyError, so the URL is
            # only stored when items.py declares a ``url`` field.
            if 'url' in item.fields:
                item['url'] = url

            print('生成器调用：', i)

            yield item

            # Alternative without a generator: create a list before the loop,
            # append every item to it and return the list after the loop.
            # The pipelines can process a returned list of items as well.

测试该函数可以注释最后的生成器步骤，取消注释前面的打印函数。

对settings.py的设置

# Scrapy reads upper-case names from settings.py; the setting is USER_AGENT
# (a hyphenated name such as ``user-agent`` is not a valid Python identifier).
USER_AGENT = 'Mozilla/5.0'  # User-Agent header sent with every request

ROBOTSTXT_OBEY = False  # Do not follow the site's robots.txt rules

LOG_LEVEL = "WARN"  # Hide log messages below warning level to keep the console clean

步骤3：编写ITEM Pipelines

注意该文件中，类里的三个方法: 打开、运行、关闭。

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import json


class QsbkPipeline:
    """Item pipeline that writes every item to ``duanzi.json``, one JSON object per line."""

    def __init__(self):
        # The output file is opened here; open_spider() would work as well.
        self.fp = open('duanzi.json', mode='w', encoding='utf-8')

    def open_spider(self, spider):
        """Run when the spider starts; only announces the start."""
        print('--------> 爬虫开始了... <---------')

    def process_item(self, item, spider):
        """Serialise *item* to a JSON line, write it and pass the item on.

        Called once for each value yielded by the spider's generator.
        """
        # dict() converts an Item instance into a plain dict for json.dumps;
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        record = dict(item)
        line = json.dumps(record, ensure_ascii=False)
        self.fp.write(f'{line}\n')

        print('pipelines调用')

        return item

    def close_spider(self, spider):
        """Run when the spider finishes; closes the output file."""
        self.fp.close()
        print('--------> 爬虫结束了... <---------')

items.py编写

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class QsbkItem(scrapy.Item):
    """Container for one scraped joke.

    Only fields declared here can be assigned on an item. A field that is
    declared but never populated is simply absent from ``dict(item)``.
    """

    author = scrapy.Field()  # name of the poster
    url = scrapy.Field()  # absolute link to the detail page (optional)
    content = scrapy.Field()  # text of the joke

注意

若要使用ITEM Pipelines，需要在settings.py中开启。

# Enable the item pipelines of this project.
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # With several pipelines registered, a smaller number runs earlier.
    "qsbk.pipelines.QsbkPipeline": 300,
}

优化数据存储的方式

方法一：使用`JsonItemExporter`

from scrapy.exporters import JsonItemExporter # scrapy框架里的JSON导出器


class QsbkPipeline:
    """Item pipeline that stores all items in ``duanzi.json`` as one JSON list."""

    def __init__(self):
        # Opened in binary mode, so no text encoding can be given to open();
        # the encoding is passed to the exporter instead.
        self.fp = open('duanzi.json', 'wb')
        self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
        self.exporter.start_exporting()  # begin the export

    def open_spider(self, spider):
        """Run when the spider starts; only announces the start."""
        print('--------> 爬虫开始了... <---------')

    def process_item(self, item, spider):
        """Hand *item* to the exporter and pass it on unchanged."""
        self.exporter.export_item(item)
        print('pipelines调用')

        return item

    def close_spider(self, spider):
        """Run when the spider finishes; ends the export and closes the file."""
        try:
            # Ends the export; needed so the output forms one complete list
            # whose elements are the exported items.
            self.exporter.finish_exporting()
        finally:
            # Close the file even if finishing the export fails.
            self.fp.close()
        print('--------> 爬虫结束了... <---------')

缺点：整个文件是一个JSON列表，读取时通常需要一次性解析全部内容，数据量大时较消耗内存（导出器本身是逐条写入文件的）。

需要开始和结束。

但整体满足格式要求，一个大列表，每个元素是一个item的json格式数据。

方法二：使用`JsonLinesItemExporter`

from scrapy.exporters import JsonLinesItemExporter


class QsbkPipeline:
    """Item pipeline that exports every item to ``duanzi.json`` through ``JsonLinesItemExporter``."""

    def __init__(self):
        # Binary mode: the text encoding is given to the exporter, not to open().
        self.fp = open('duanzi.json', mode='wb')
        self.exporter = JsonLinesItemExporter(
            self.fp,
            ensure_ascii=False,
            encoding='utf-8',
        )

    def open_spider(self, spider):
        """Run when the spider starts; only announces the start."""
        print('--------> 爬虫开始了... <---------')

    def process_item(self, item, spider):
        """Export *item* and return it unchanged."""
        exporter = self.exporter
        exporter.export_item(item)
        print('pipelines调用')
        return item

    def close_spider(self, spider):
        """Run when the spider finishes; closes the output file."""
        output = self.fp
        output.close()
        print('--------> 爬虫结束了... <---------')

此方法把每个item作为一个单独的字典类型存入文件。

不需要开始和结束。

缺点：每行数据为一个字典（json格式），但整个文件并不满足json格式。

抓取多个页面

主要是最后几行代码

import scrapy
from urllib.parse import urljoin
from qsbk.items import QsbkItem


class QsbkSpiderSpider(scrapy.Spider):
    """Spider that extracts the jokes of every listing page.

    ``parse`` yields one ``QsbkItem`` per entry ``div`` and then follows the
    pagination link at the bottom of the page, using itself as the callback.
    """

    name = 'qsbk_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['http://qiushibaike.com/text/page/1/']

    def parse(self, response):
        """Yield the items of *response*, then a request for the next page."""
        duanzi_divs = response.xpath("//div[@class='col1 old-style-col1']/div")
        for i, div in enumerate(duanzi_divs, start=1):
            # default='' keeps .strip() from failing on a div without <h2>.
            author = div.xpath(".//h2/text()").get(default='').strip()
            # href attribute of the <a> tag that links to the detail page,
            # resolved against the page that was actually fetched.
            href = div.xpath('./a/@href').get()
            url = response.urljoin(href) if href else None
            content = div.xpath(".//div[@class='content']//text()").extract()
            content = ''.join(content).strip()
            item = QsbkItem(author=author, content=content)
            # Assigning an undeclared field raises KeyError, so the URL is
            # only stored when items.py declares a ``url`` field.
            if 'url' in item.fields:
                item['url'] = url
            print('生成器调用：', i)
            yield item

        # The last <li> of the pagination bar holds the "next page" link.
        next_path = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()

        # No link means there is no further page: the generator simply ends.
        if next_path:
            # Resolve relative to the current page and parse the result with
            # this same method.
            yield scrapy.Request(response.urljoin(next_path), callback=self.parse)