A Fengniao (fengniao.com) data-collection crawler built with Python Scrapy, including proxy handling, log processing, and the complete source code
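The custom_settings block below registers two project-local downloader middlewares, fengniao.middlewares.ProxiesMiddleware and fengniao.middlewares.HeadersMiddleware, whose source files are not part of this excerpt. A minimal sketch of what such a middlewares module could look like follows; the proxy endpoints, user-agent strings, and random rotation are illustrative assumptions, not the project's actual implementation.

import random

# Hypothetical sketch of fengniao/middlewares.py; placeholder values only.
PROXIES = [
    'http://127.0.0.1:8888',
    'http://127.0.0.1:8889',
]

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (X11; Linux x86_64)',
]

class ProxiesMiddleware(object):
    # Attach a randomly chosen proxy to every outgoing request; Scrapy routes the
    # request through whatever is set in request.meta['proxy'].
    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)
        spider.logger.debug('Using proxy %s for %s', proxy, request.url)
        request.meta['proxy'] = proxy

class HeadersMiddleware(object):
    # Rotate the User-Agent header; the stock UserAgentMiddleware is disabled in
    # custom_settings, so this replacement has to set the header itself.
    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(USER_AGENTS)

Setting request.meta['proxy'] is the standard Scrapy way to route a single request through a proxy; returning None from process_request (the implicit default here) lets the request continue through the remaining middlewares unchanged.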
import scrapy
from fengniao.items import FengniaoItem
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import TimeoutError, TCPTimedOutError, DNSLookupError, ConnectionRefusedError
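# The HttpError and twisted error classes above are typically consumed in a request
# errback (not shown in this excerpt): HttpError flags non-2xx responses, while the
# twisted errors distinguish DNS lookup, connection-refused and timeout failures.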
class FengniaoclawerSpider(scrapy.Spider):
    name = 'fengniaoClawer'
    allowed_domains = ['fengniao.com']

    # Spider-specific settings; these override the values in settings.py.
    custom_settings = {
        'LOG_LEVEL': 'DEBUG',        # log level
        'DOWNLOAD_DELAY': 0,         # download delay
        'COOKIES_ENABLED': False,    # enabled by default
        'DEFAULT_REQUEST_HEADERS': {
            # 'Host': 'www.fengniao.com',
            'Referer': 'https://www.fengniao.com',
        },

        # Item pipelines; items pass through them in ascending priority order (lowest number first).
        'ITEM_PIPELINES': {
            'fengniao.pipelines.ImagePipeline': 100,
            'fengniao.pipelines.FengniaoPipeline': 300,
        },

        # Image download settings.
        'IMAGES_STORE': 'fengniaoPhoto',   # storage directory, created if it does not exist
        'IMAGES_EXPIRES': 90,              # expiration in days; images already stored within this period are not downloaded again
        'IMAGES_MIN_HEIGHT': 100,          # minimum image height; shorter images are not downloaded
        'IMAGES_MIN_WIDTH': 100,           # minimum image width; narrower images are not downloaded

        # Downloader middlewares; requests pass through them in ascending priority order (lowest number first).
        'DOWNLOADER_MIDDLEWARES': {
            'fengniao.middlewares.ProxiesMiddleware': 400,
            'fengniao.middlewares.HeadersMiddleware': 543,
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
        },

        'DEPTH_PRIORITY': 1,  # breadth-first crawl relative to start_urls; only locally BFS, influenced by CONCURRENT_REQUESTS
        'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue',
        'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue',

        'REDIRECT_PRIORITY_ADJUST': 2,   # Default: +2
        'RETRY_PRIORITY_ADJUST': -1,     # Default: -1

        'RETRY_TIMES': 8,  # number of retries
        # Default: 2; can also be specified per request via the max_retry_times key of Request.meta

        'DOWNLOAD_TIMEOUT': 30,
        # Can be set per spider via the download_timeout spider attribute and per request via the download_timeout Request.meta key

        # 'DUPEFILTER_CLASS': "scrapy_redis.dupefilter.RFPDupeFilter",
        # 'SCHEDULER': "scrapy_redis.scheduler.Scheduler",
        # 'SCHEDULER_PERSIST': False,  # Don't cleanup redis queues, allows to pause/resume crawls