基于Python Scrapy实现的爬取豆瓣读书9分榜单的书籍数据采集爬虫系统(含数据集和全部源代码)
# -*- coding: utf-8 -*-
import scrapy
import re
from doubanbook.items import DoubanbookItem
class DbbookSpider(scrapy.Spider):
name = "dbbook"
# allowed_domains = ["https://www.douban.com/doulist/1264675/"]
start_urls = (
'https://www.douban.com/doulist/1264675/',
)
URL = 'https://www.douban.com/doulist/1264675/?start=PAGE&sort=seq&sub_type='
def parse(self, response):
# print response.body
item = DoubanbookItem()
selector = scrapy.Selector(response)
books = selector.xpath('//div[@class="bd doulist-subject"]')
for each in books:
title = each.xpath('div[@class="title"]/a/text()').extract()[0]
rate = each.xpath('div[@class="rating"]/span[@class="rating_nums"]/text()').extract()[0]
author = re.search('(.*?)