第一章 引言 1 第二章 相关技术介绍 2 2.1 JAVA线程 2 2.1.1 线程概述 2 2.1.2 JAVA线程模型 2 2.1.3 创建线程 3 2.1.4 JAVA中的线程的生命周期 4 2.1.5 JAVA线程的结束方式 4 2.1.6 多线程同步 5 2.2 URL消重 5 2.2.1 URL消重的意义 5 2.2.2 网络爬虫URL去重储存库设计 5 2.2.3 LRU算法实现URL消重 7 2.3 URL类访问网络 8 2.4 爬行策略浅析 8 2.4.1宽度或深度优先搜索策略 8 2.4.2 聚焦搜索策略 9 2.4.3基于内容评价的搜索策略 9 2.4.4 基于链接结构评价的搜索策略 10 2.4.5 基于巩固学习的聚焦搜索 11 2.4.6 基于语境图的聚焦搜索 11 第三章 系统需求分析及模块设计 13 3.1 系统需求分析 13 3.2 SPIDER体系结构 13 3.3 各主要功能模块(类)设计 14 3.4 SPIDER工作过程 14 第四章 系统分析与设计 16 4.1 SPIDER构造分析 16 4.2 爬行策略分析 17 4.3 URL抽取,解析和保存 18 4.3.1 URL抽取 18 4.3.2 URL解析 19 4.3.3 URL保存 19 第五章 系统实现 21 5.1 实现工具 21 5.2 爬虫工作 21 5.3 URL解析 22 5.4 URL队列管理 24 5.4.1 URL消重处理 24 5.4.2 URL等待队列维护 26 5.4.3 数据库设计 27 第六章 系统测试 29 第七章 结论 32 参考文献 33 致谢 34 外文资料原文 35 译文 51
2021-07-10 20:02:02 599KB java 毕业设计 网络爬虫 设计报告
基于Python Scrapy实现的拉勾网全站职位数据采集 爬虫系统 含数据库处理 # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # http://doc.scrapy.org/en/latest/topics/items.html import scrapy,re from scrapy.loader import ItemLoader from scrapy.loader.processors import MapCompose, TakeFirst from w3lib.html import remove_tags def extract_num(text): #从字符串中提取出数字 match_re = re.match(".*?(\d+).*", text) if match_re: nums = int(match_re.group(1)) else: nums = 0 return nums def replace_splash(value): '''去除/''' return value.replace("/", "") def handle_strip(value): '''空格''' return value.strip() def handle_jobaddr(value): '''去查看地图''' addr_list = value.split("\n") addr_list = [item.strip() for item in addr_list if item.strip() != "查看地图"] return "".join(addr_list) class LagouJobItemLoader(ItemLoader): #自定义itemloader default_output_processor = TakeFirst() class LagouJobItem(scrapy.Item): #拉勾网职位 title = scrapy.Field() url = scrapy.Field() salary = scrapy.Field() job_city = scrapy.Field( input_processor=MapCompose(replace_splash), ) work_years = scrapy.Field( input_processor=MapCompose(replace_splash), ) degree_need = scrapy.Field( input_processor=MapCompose(replace_splash), ) job_type = scrapy.Field() publish_time = scrapy.Field() job_advantage = scrapy.Field() job_desc = scrapy.Field( input_processor=MapCompose(handle_strip), ) job_addr = scrapy.Field( input_processor=MapCompose(remove_tags, handle_jobaddr), ) company_name = scrapy.Field( input_processor=MapCompose(handle_strip), ) company_url = scrapy.Field() crawl_time = scrapy.Field() crawl_update_time = scrapy.Field() def get_insert_sql(self): insert_sql = """ insert into lagou_job(title, url, salary, job_city, work_years, degree_need, job_type, publish_time
2021-07-10 17:02:48 7KB python scapy 爬虫 拉勾
基于Python Scrapy实现的拉勾网全站职位数据采集 爬虫系统 含数据库处理和全部源代码 # -*- coding: utf-8 -*- import scrapy from meiju100.items import Meiju100Item class MeijuSpider(scrapy.Spider): name = "meiju" allowed_domains = ["meijutt.com"] start_urls = ['http://www.meijutt.com/new100.html'] def parse(self, response): items = [] subSelector = response.xpath('//ul[@class="top-list fn-clear"]/li') for sub in subSelector: item = Meiju100Item() item['storyName'] = sub.xpath('./h5/a/text()').extract() item['storyState'] = sub.xpath('./span[1]/font/text()').extract() if item['storyState']: pass else: item['storyState'] = sub.xpath('./span[1]/text()').extract() item['tvStation'] = sub.xpath('./span[2]/text()').extract() if item['tvStation']: pass else: item['tvStation'] = [u'未知'] item['updateTime'] = sub.xpath('./div[2]/text()').extract() if item['updateTime']: pass else: item['updateTime'] = sub.xpath('./div[2]/font/text()').extract() items.append(item) return items
2021-07-10 17:02:48 14KB Python scrapy 爬虫 数据采集
基于Python Scrapy实现的百思不得姐段子的数据采集爬虫系统 含全部源代码 import scrapy from budejie.items import BudejieItem class BudejieSpider(scrapy.Spider): """百思不得姐段子的爬虫""" name = 'budejie' start_urls = ['http://www.budejie.com/text/'] total_page = 50 def parse(self, response): current_page = int(response.css(u'a.z-crt::text').extract_first()) print u'current page: {}'.format(current_page) lies = response.css(u'div.j-r-list >ul >li') for li in lies: username = li.css(u'a.u-user-name::text').extract_first() user_url = li.css(u'div.u-txt a::attr(href)').extract_first() content = u'\n'.join(li.css(u'div.j-r-list-c-desc a::text').extract()) content_url = li.css(u'div.j-r-list-c-desc a::attr(href)').extract_first() yield BudejieItem( username=username, content=content, user_url=user_url, content_url=content_url, ) if current_page < self.total_page: next_page_url = self.start_urls[0] + '{}'.format(current_page + 1) yield scrapy.Request(next_page_url)
2021-07-10 17:02:47 13KB python scrapy 数据采集 段子
基于Python Scrapy实现的爬取豆瓣读书9分榜单的书籍数据采集爬虫系统 含数据集和全部源代码 # -*- coding: utf-8 -*- import scrapy import re from doubanbook.items import DoubanbookItem class DbbookSpider(scrapy.Spider): name = "dbbook" # allowed_domains = ["https://www.douban.com/doulist/1264675/"] start_urls = ( 'https://www.douban.com/doulist/1264675/', ) URL = 'https://www.douban.com/doulist/1264675/?start=PAGE&sort=seq&sub_type=' def parse(self, response): # print response.body item = DoubanbookItem() selector = scrapy.Selector(response) books = selector.xpath('//div[@class="bd doulist-subject"]') for each in books: title = each.xpath('div[@class="title"]/a/text()').extract()[0] rate = each.xpath('div[@class="rating"]/span[@class="rating_nums"]/text()').extract()[0] author = re.search('(.*?)
2021-07-10 17:02:47 19KB python scrapy 爬虫 数据采集
基于Python Scrapy实现的蜂鸟数据采集爬虫系统 含代理、日志处理和全部源代码等 import scrapy from fengniao.items import FengniaoItem from scrapy.spidermiddlewares.httperror import HttpError from twisted.internet.error import TimeoutError, TCPTimedOutError, DNSLookupError, ConnectionRefusedError class FengniaoclawerSpider(scrapy.Spider): name = 'fengniaoClawer' allowed_domains = ['fengniao.com'] # 爬虫自定义设置,会覆盖 settings.py 文件中的设置 custom_settings = { 'LOG_LEVEL': 'DEBUG', # 定义log等级 'DOWNLOAD_DELAY': 0, # 下载延时 'COOKIES_ENABLED': False, # enabled by default 'DEFAULT_REQUEST_HEADERS': { # 'Host': 'www.fengniao.com', 'Referer': 'https://www.fengniao.com', }, # 管道文件,优先级按照由小到大依次进入 'ITEM_PIPELINES': { 'fengniao.pipelines.ImagePipeline': 100, 'fengniao.pipelines.FengniaoPipeline': 300, }, # 关于下载图片部分 'IMAGES_STORE': 'fengniaoPhoto', # 没有则新建 'IMAGES_EXPIRES': 90, # 图片有效期,已经存在的图片在这个时间段内不会再下载 'IMAGES_MIN_HEIGHT': 100, # 图片最小尺寸(高度),低于这个高度的图片不会下载 'IMAGES_MIN_WIDTH': 100, # 图片最小尺寸(宽度),低于这个宽度的图片不会下载 # 下载中间件,优先级按照由小到大依次进入 'DOWNLOADER_MIDDLEWARES': { 'fengniao.middlewares.ProxiesMiddleware': 400, 'fengniao.middlewares.HeadersMiddleware': 543, 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, }, 'DEPTH_PRIORITY': 1, # BFS,是以starts_url为准,局部BFS,受CONCURRENT_REQUESTS影响 'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue', 'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue', 'REDIRECT_PRIORITY_ADJUST': 2, # Default: +2 'RETRY_PRIORITY_ADJUST': -1, # Default: -1 'RETRY_TIMES': 8, # 重试次数 # Default: 2, can also be specified per-request using max_retry_times attribute of Request.meta 'DOWNLOAD_TIMEOUT': 30, # This timeout can be set per spider using download_timeout spider attribute and per-request using download_timeout Request.meta key # 'DUPEFILTER_CLASS': "scrapy_redis.dupefilter.RFPDupeFilter", # 'SCHEDULER': "scrapy_redis.scheduler.Scheduler", # 'SCHEDULER_PERSIST': False, # Don't cleanup red
2021-07-10 17:02:46 14KB python scrapy 爬虫 数据采集
基于Python Scrapy实现的豆瓣电影数据采集爬虫系统 含数据库SQL和全部源代码 # -*- coding: utf-8 -*- """ @Author : nesta @Email : 572645517@qq.com @Software: PyCharm @project : movie @File : MovieSpider.py @Time : 2018/4/26 9:18 """ from scrapy.spiders import Spider from scrapy.http import Request from scrapy.selector import Selector from movie.items import MovieItem class MovieSpider(Spider): name = 'movie' url = u'https://movie.douban.com/top250' start_urls = [u'https://movie.douban.com/top250'] def parse(self, response): item = MovieItem() selector = Selector(response) # 解析 movies = selector.xpath('//div[@class="info"]') for movie in movies: title = movie.xpath('div[@class="hd"]/a/span/text()').extract() fullTitle = '' for each in title: fullTitle += each movieInfo = movie.xpath('div[@class="bd"]/p/text()').extract() star = movie.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract()[0] quote = movie.xpath('div[@class="bd"]/p/span/text()').extract() if quote: quote = quote[0] else: quote = '' item['title'] = fullTitle item['movieInfo'] = ';'.join(movieInfo).replace(' ', '').replace('\n', '') item['star'] = star[0] item['quote'] = quote yield item nextPage = selector.xpath('//span[@class="next"]/link/@href').extract() if nextPage: nextPage = nextPage[0] print(self.url + str(nextPage)) yield Request(self.url + str(nextPage), callback=self.parse)
2021-07-10 17:02:46 14KB python scrapy 豆瓣电影 数据爬虫
基于Python Scrapy实现的腾讯招聘职位数据爬取爬虫系统 含结果数据集和全部源代码 # -*- coding: utf-8 -*- import scrapy from Tencent.items import TencentItem class TencentpostionSpider(scrapy.Spider): name = 'tencentPosition' allowed_domains = ['tencent.com'] url = "http://hr.tencent.com/position.php?&start=" offset = 0 # 起始url start_urls = [url + str(offset)] def parse(self, response): for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"): # 初始化模型对象 item = TencentItem() # 职位名称 item['positionname'] = each.xpath("./td[1]/a/text()").extract()[0] # 详情连接 item['positionlink'] = each.xpath("./td[1]/a/@href").extract()[0] # 职位类别 item['positionType'] = each.xpath("./td[2]/text()").extract()[0] # 招聘人数 item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0] # 工作地点 item['workLocation'] = each.xpath("./td[4]/text()").extract()[0] # 发布时间 item['publishTime'] = each.xpath("./td[5]/text()").extract()[0] yield item if self.offset < 1680: self.offset += 10 # 每次处理完一页的数据之后,重新发送下一页页面请求 # self.offset自增10,同时拼接为新的url,并调用回调函数self.parse处理Response yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
2021-07-10 17:02:45 15KB python scrapy 腾讯 招聘
基于python的网络爬虫系统的设计与实现.pdf
2021-06-28 20:04:27 1.51MB Python 程序 软件开发 论文期刊
基于Python 的网络爬虫系统.pdf
2021-06-28 20:04:07 1.42MB Python 程序 软件开发 论文期刊