A Douban movie data-collection spider system built with Python Scrapy, including the database SQL and full source code
# -*- coding: utf-8 -*-
"""
@Author  : nesta
@Email   : 572645517@qq.com
@Software: PyCharm
@project : movie
@File    : MovieSpider.py
@Time    : 2018/4/26 9:18
"""
from scrapy.spiders import Spider
from scrapy.http import Request
from scrapy.selector import Selector

from movie.items import MovieItem


class MovieSpider(Spider):
    name = 'movie'
    url = 'https://movie.douban.com/top250'
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        selector = Selector(response)  # wrap the response for XPath queries
        movies = selector.xpath('//div[@class="info"]')
        for movie in movies:
            item = MovieItem()  # create a fresh item for every movie entry

            # The title may be split across several <span> tags (Chinese title,
            # original title, aliases), so join the pieces into one string.
            title = movie.xpath('div[@class="hd"]/a/span/text()').extract()
            fullTitle = ''.join(title)

            # Director / cast / year / country / genre text lines.
            movieInfo = movie.xpath('div[@class="bd"]/p/text()').extract()

            # Rating number, e.g. "9.7".
            star = movie.xpath('div[@class="bd"]/div[@class="star"]'
                               '/span[@class="rating_num"]/text()').extract()[0]

            # One-line quote; some entries have none.
            quote = movie.xpath('div[@class="bd"]/p/span/text()').extract()
            quote = quote[0] if quote else ''

            item['title'] = fullTitle
            item['movieInfo'] = ';'.join(movieInfo).replace(' ', '').replace('\n', '')
            item['star'] = star
            item['quote'] = quote
            yield item

        # Follow the "next page" link (a relative URL such as "?start=25&filter=")
        # until the last of the ten Top 250 pages has been crawled.
        nextPage = selector.xpath('//span[@class="next"]/link/@href').extract()
        if nextPage:
            nextPage = nextPage[0]
            print(self.url + nextPage)
            yield Request(self.url + nextPage, callback=self.parse)
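
The spider imports MovieItem from movie/items.py, which is not reproduced above. A minimal definition that matches the four fields the spider fills in might look like the following sketch (the field names come from the spider; everything else is just the standard Scrapy item boilerplate):

# movie/items.py -- minimal sketch matching the fields used by MovieSpider
import scrapy


class MovieItem(scrapy.Item):
    title = scrapy.Field()      # full movie title, joined from the <span> fragments
    movieInfo = scrapy.Field()  # director / cast / year / country / genre line
    star = scrapy.Field()       # Douban rating, e.g. "9.7"
    quote = scrapy.Field()      # one-line quote, may be empty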
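
The title mentions that the project ships with a database SQL script, but that script is not shown here. As a rough illustration of how the yielded items could end up in MySQL, the sketch below creates an assumed table and inserts one row per item through an item pipeline; the host, credentials, database name, table name and column types are placeholders, not the author's actual schema:

# movie/pipelines.py -- hypothetical MySQL pipeline; schema and credentials are assumptions
import pymysql


class MoviePipeline(object):
    def open_spider(self, spider):
        # Connect once when the spider starts.
        self.conn = pymysql.connect(host='localhost', user='root', password='root',
                                    db='movie', charset='utf8mb4')
        self.cursor = self.conn.cursor()
        # Create the target table if it does not exist yet (assumed schema).
        self.cursor.execute(
            'CREATE TABLE IF NOT EXISTS douban_movie ('
            '  id INT AUTO_INCREMENT PRIMARY KEY,'
            '  title VARCHAR(255),'
            '  movieInfo TEXT,'
            '  star VARCHAR(10),'
            '  `quote` VARCHAR(255)'
            ')'
        )
        self.conn.commit()

    def process_item(self, item, spider):
        # One INSERT per scraped movie.
        self.cursor.execute(
            'INSERT INTO douban_movie (title, movieInfo, star, `quote`) '
            'VALUES (%s, %s, %s, %s)',
            (item['title'], item['movieInfo'], item['star'], item['quote'])
        )
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()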
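
Douban tends to reject requests carrying Scrapy's default user agent, so settings.py usually needs a browser-like USER_AGENT before the spider will receive normal pages; the excerpt below is an assumed example, not the project's actual configuration:

# movie/settings.py -- assumed excerpt; values are illustrative
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0 Safari/537.36')
ROBOTSTXT_OBEY = False  # commonly disabled in such tutorial projects; check Douban's robots.txt yourself
ITEM_PIPELINES = {
    'movie.pipelines.MoviePipeline': 300,  # enable the MySQL pipeline sketched above
}

With the project configured, the crawl is started from the project root with `scrapy crawl movie`, or with `scrapy crawl movie -o top250.csv` to also export the items to a CSV file.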