A NetEase Cloud Music (music163) crawler built with Python Scrapy, full source code included
A NetEase Cloud Music crawler based on the Scrapy framework. The crawl proceeds in four stages (sketched in code after this list):
- Use the artist pages as the index and collect every artist;
- From each artist page, collect all of that artist's albums;
- From all albums, collect every song;
- Finally, fetch each song's hot comments.
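
As a minimal sketch of how these four stages chain together as Scrapy callbacks (using the current Scrapy API; the URLs and XPath selectors below are placeholders, not the project's actual ones):

```python
import scrapy


class Music163Sketch(scrapy.Spider):
    """Illustrative four-stage chain: artists -> albums -> songs -> comments."""
    name = "music_sketch"
    # Hypothetical entry point; the real index pages differ.
    start_urls = ["http://music.163.com/discover/artist"]

    def parse(self, response):
        # Stage 1: follow every artist link found on the index page.
        for href in response.xpath('//a[contains(@href, "/artist?id=")]/@href').getall():
            yield response.follow(href, callback=self.parse_artist)

    def parse_artist(self, response):
        # Stage 2: from an artist page, follow every album link.
        for href in response.xpath('//a[contains(@href, "/album?id=")]/@href').getall():
            yield response.follow(href, callback=self.parse_album)

    def parse_album(self, response):
        # Stage 3: from an album page, follow every song link.
        for href in response.xpath('//a[contains(@href, "/song?id=")]/@href').getall():
            yield response.follow(href, callback=self.parse_song)

    def parse_song(self, response):
        # Stage 4: collect song metadata. Hot comments are typically served
        # through a separate API rather than the page HTML, so they would
        # need their own request (details omitted in this sketch).
        yield {
            "song_url": response.url,
            "song_name": response.xpath("//title/text()").get(),
        }
```

Each callback only yields requests for the next stage, so Scrapy's scheduler drives all four stages concurrently.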
Data is saved to a `MongoDB` database: each song's artist, title, and album, plus each hot comment's author, upvote count, and the author's avatar URL. The commenter avatar URLs are captured so that, if people like the project, it can later be turned into a web front end.
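
A typical way to wire up the MongoDB side is a Scrapy item pipeline backed by `pymongo`. The sketch below is illustrative; the setting names `MONGODB_URI`/`MONGODB_DB` and the `songs` collection are assumptions, not necessarily what this project uses:

```python
import pymongo


class MongoPipeline:
    """Writes each scraped item into a MongoDB collection."""

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # MONGODB_URI / MONGODB_DB are assumed settings.py keys.
        return cls(
            mongo_uri=crawler.settings.get("MONGODB_URI", "mongodb://localhost:27017"),
            mongo_db=crawler.settings.get("MONGODB_DB", "music163"),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Upsert on the song URL so re-crawls update rather than duplicate.
        self.db["songs"].update_one(
            {"song_url": item.get("song_url")}, {"$set": dict(item)}, upsert=True
        )
        return item
```

It would be enabled in `settings.py` via `ITEM_PIPELINES = {'yourproject.pipelines.MongoPipeline': 300}` (module path hypothetical).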
### Run:
```
$ scrapy crawl music
```
### Source code:
```python
#!/usr/bin/python
# -*- coding: utf-8 -*-
import time
from pprint import pprint

# Legacy (pre-1.0) Scrapy APIs: newer Scrapy replaces BaseSpider with
# scrapy.Spider and HtmlXPathSelector with response.xpath().
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from woaidu_crawler.items import WoaiduCrawlerItem
from woaidu_crawler.utils.select_result import list_first_item, strip_null, deduplication, clean_url


class WoaiduSpider(BaseSpider):
    name = "woaidu"
    start_urls = (
        'http://www.woaidu.org/sitemap_1.html',
    )

    def parse(self, response):
        response_selector = HtmlXPathSelector(response)

        # Follow the "下一页" (next page) link to walk the whole index.
        next_link = list_first_item(response_selector.select(
            u'//div[@class="k2"]/div/a[text()="下一页"]/@href').extract())
        if next_link:
            next_link = clean_url(response.url, next_link, response.encoding)
            yield Request(url=next_link, callback=self.parse)

        # Queue every detail page found on the current index page.
        for detail_link in response_selector.select(
                u'//div[contains(@class,"sousuolist")]/a/@href').extract():
            if detail_link:
                detail_link = clean_url(response.url, detail_link, response.encoding)
                yield Request(url=detail_link, callback=self.parse_detail)

    def parse_detail(self, response):
        woaidu_item = WoaiduCrawlerItem()
        response_selector = HtmlXPathSelector(response)

        woaidu_item['book_name'] = list_first_item(
            response_selector.select('//div[@class="zizida"][1]/text()').extract())
        # The author field carries a leading label, so drop the first five
        # characters and strip the remainder.
        woaidu_item['author'] = [list_first_item(
            response_selector.select('//div[@class="xiaoxiao"][1]/text()').extract())[5:].strip(), ]
        woaidu_item['book_description'] = list_first_item(
            response_selector.select('//div[@class="lili"][1]/text()').extract()).strip()
        # The original source is truncated here; the XPath below is a
        # placeholder completing the obvious pattern of the fields above.
        woaidu_item['book_covor_image_url'] = list_first_item(
            response_selector.select('//img[contains(@class,"cover")]/@src').extract())
        yield woaidu_item
```
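
The helpers imported from `woaidu_crawler.utils.select_result` are not shown above. Judging from how they are used, minimal Python 3 equivalents might look like this (an approximation, not the project's actual implementation):

```python
from urllib.parse import urljoin


def list_first_item(lst):
    """Return the first element of a selector result list, or None if it is empty."""
    return lst[0] if lst else None


def strip_null(lst):
    """Drop empty or whitespace-only entries."""
    return [item for item in lst if item and item.strip()]


def deduplication(lst):
    """Remove duplicates while preserving the original order."""
    seen = set()
    return [item for item in lst if not (item in seen or seen.add(item))]


def clean_url(base_url, link, encoding):
    """Resolve a possibly relative link against the page URL.

    The encoding argument is accepted for signature compatibility
    but unused in this sketch.
    """
    return urljoin(base_url, link.strip())
```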