notice
Overview
import scrapy
class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes (text, author, tags) from quotes.toscrape.com,
    saves each page's raw HTML locally, and follows pagination links."""

    # Unique identifier for this spider; must not be duplicated within a project.
    name = "quotes"

    # Seeding option 1: start_urls (consumed by the default start_requests).
    # NOTE: defining start_requests below overrides this list entirely —
    # these URLs are only used if start_requests is removed.
    start_urls = [
        'http://quotes.toscrape.com/tag/humor/',
    ]

    # Seeding option 2: an explicit start_requests generator.
    def start_requests(self):
        """Yield the initial requests, each routed to self.parse."""
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Parse a listing page (response is a TextResponse):
        yield one dict per quote, dump the raw body to disk, follow pagination."""
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

        # Save the raw page body for offline inspection, named after the
        # second-to-last URL path segment (e.g. quotes-1.html for /page/1/).
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)

        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            # Follow pagination with response.follow: unlike scrapy.Request,
            # it accepts relative URLs (no urljoin needed) and also accepts
            # selectors / <a> elements directly (href is auto-extracted).
            # Equivalent manual form:
            #   yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
            # (Yielding BOTH forms, as the original did, requests the same
            # URL twice — only one is needed.)
            yield response.follow(next_page, self.parse)
scrapy.Request

Installation
First spider (crawler)
Storage
More
Last updated