# -*- coding: utf-8 -*-
import scrapy
from scrapy.crawler import CrawlerProcess
from myFirstSpider.items import MyfirstspiderItem  # import the item class
class NgaspiderSpider(scrapy.Spider):
    """Spider that scrapes quotes (content, author, tags) from lab.scrapyd.cn
    and follows the 'next' pagination link until exhausted."""

    name = 'labSpider'
    start_urls = ['http://lab.scrapyd.cn/']

    def parse(self, response):
        """Parse one listing page.

        Yields one ``MyfirstspiderItem`` per quote on the page (fields:
        ``content``, ``author``, ``tag``), then schedules a request for the
        next page, if any, with this same method as callback.
        """
        for quote in response.css('div.quote'):  # one selector per quote box
            # BUGFIX: the original code created a single item instance
            # outside the loop and mutated it on every iteration, so every
            # yielded reference pointed at the same object. Create a fresh
            # item per quote instead.
            item = MyfirstspiderItem()
            item['content'] = quote.css('.text::text').extract_first()  # quote text
            item['author'] = quote.css('.author::text').extract_first()  # author name
            tags = quote.css('.tags .tag::text').extract()  # list of tag strings
            item['tag'] = ','.join(tags)  # store as one comma-separated string
            yield item

        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            # urljoin resolves the relative href into an absolute URL
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
|