# ngaSpider.py (1.5 KB)
  1. # -*- coding: utf-8 -*-
  2. import scrapy
  3. from scrapy.crawler import CrawlerProcess
  4. from myFirstSpider.items import MyfirstspiderItem # 引入item
  5. class NgaspiderSpider(scrapy.Spider):
  6. name = 'labSpider'
  7. start_urls = ['http://lab.scrapyd.cn/']
  8. def parse(self, response):
  9. sentence = response.css('div.quote') # css选择器选择所有句子
  10. item = MyfirstspiderItem() # 实例化引入的类
  11. for v in sentence: # 循环获取每个句子里面的:内容、作者、标签
  12. item['content'] = v.css('.text::text').extract_first() # 提取名言
  13. item['author'] = v.css('.author::text').extract_first() # 提取作者
  14. tags = v.css('.tags .tag::text').extract() # 提取标签
  15. item['tag'] = ','.join(tags) # 转化为字符串
  16. yield item
  17. # fileName = '%s-语录.txt' % autor # 定义文件名
  18. # with open(fileName, "a+") as f: # 不同人的名言保存在不同的txt文档,“a+”以追加的形式
  19. # f.write(text)
  20. # f.write('\n') # 写入 \n 以达到换行的效果
  21. # f.write('标签:' + tags)
  22. # f.write('\n-------\n')
  23. # f.close()
  24. next_page = response.css('li.next a::attr(href)').extract_first()
  25. if next_page != None:
  26. next_page = response.urljoin(next_page) # 使用urljoin方法将相对路径转换为绝对路径
  27. yield scrapy.Request(next_page, callback=self.parse)