# Budejie (百思不得姐) joke scraper built with Python Scrapy -- full source code included.
import scrapy
from budejie.items import BudejieItem
class BudejieSpider(scrapy.Spider):
    """Spider that collects text jokes from budejie.com.

    Crawling starts at the first URL in ``start_urls``; each parsed page
    schedules the following numbered page until ``total_page`` is reached.
    """

    name = 'budejie'
    start_urls = ['http://www.budejie.com/text/']
    # Pagination stops once the current page number reaches this value.
    total_page = 50

    def parse(self, response):
        """Yield one ``BudejieItem`` per list entry, then the next-page request.

        Args:
            response: The downloaded listing page.

        Yields:
            ``BudejieItem`` objects with ``username``, ``content``,
            ``user_url`` and ``content_url`` set, followed by a
            ``scrapy.Request`` for the next page when the current page
            number is known and below ``total_page``.
        """
        current_page = self._current_page(response)
        if current_page is not None:
            # Use the spider logger rather than ``print`` so the message goes
            # through Scrapy's logging and the code parses on Python 2 and 3.
            self.logger.info(u'current page: %s', current_page)

        for li in response.css(u'div.j-r-list >ul >li'):
            # A post body may be split across several text nodes; keep them
            # all, one per line.
            content = u'\n'.join(
                li.css(u'div.j-r-list-c-desc a::text').extract()
            )
            yield BudejieItem(
                username=li.css(u'a.u-user-name::text').extract_first(),
                content=content,
                user_url=li.css(u'div.u-txt a::attr(href)').extract_first(),
                content_url=li.css(
                    u'div.j-r-list-c-desc a::attr(href)'
                ).extract_first(),
            )

        if current_page is not None and current_page < self.total_page:
            # Listing pages are addressed as <first start URL><page number>.
            next_page_url = '{}{}'.format(self.start_urls[0], current_page + 1)
            yield scrapy.Request(next_page_url, callback=self.parse)

    def _current_page(self, response):
        """Return the page number read from ``a.z-crt``, or ``None``.

        ``None`` is returned (and a warning logged) when the element is
        missing or its text is not an integer, so that a page without the
        expected pagination marker does not abort item extraction.
        """
        raw_page = response.css(u'a.z-crt::text').extract_first()
        try:
            return int(raw_page)
        except (TypeError, ValueError):
            self.logger.warning(
                u'cannot determine current page of %s (got %r); '
                u'pagination stops here',
                response.url,
                raw_page,
            )
            return None