本文共 6747 字,大约阅读时间需要 22 分钟。
这是一个使用scrapy的ImagesPipeline爬取下载图片的示例,生成的图片保存在爬虫的full文件夹里。
scrapy startproject DoubanImgs
cd DoubanImgs
scrapy genspider download_douban douban.com
vim spiders/download_douban.py
# coding=utf-8from scrapy.spiders import Spiderimport refrom scrapy import Requestfrom ..items import DoubanImgsItemclass download_douban(Spider): name = 'download_douban' default_headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, sdch, br', 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Host': 'www.douban.com', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36', } def __init__(self, url='1638835355', *args, **kwargs): self.allowed_domains = ['douban.com'] self.start_urls = [] for i in xrange(23): if i == 0: page_url = 'http://www.douban.com/photos/album/' + url else: page_url = 'http://www.douban.com/photos/album/' + url + '/?start=' + str(i*18) self.start_urls.append(page_url) self.url = url # call the father base function # super(download_douban, self).__init__(*args, **kwargs) def start_requests(self): for url in self.start_urls: yield Request(url=url, headers=self.default_headers, callback=self.parse) def parse(self, response): list_imgs = response.xpath('//div[@class="photolst clearfix"]//img/@src').extract() if list_imgs: item = DoubanImgsItem() item['image_urls'] = list_imgs yield item
vim settings.py
# -*- coding: utf-8 -*-# Scrapy settings for DoubanImgs project## For simplicity, this file contains only settings considered important or# commonly used. You can find more settings consulting the documentation:## https://doc.scrapy.org/en/latest/topics/settings.html# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html# https://doc.scrapy.org/en/latest/topics/spider-middleware.htmlBOT_NAME = 'DoubanImgs'SPIDER_MODULES = ['DoubanImgs.spiders']NEWSPIDER_MODULE = 'DoubanImgs.spiders'ITEM_PIPELINES = { 'DoubanImgs.pipelines.DoubanImgDownloadPipeline': 300,}IMAGES_STORE = '.'IMAGES_EXPIRES = 90# Crawl responsibly by identifying yourself (and your website) on the user-agent#USER_AGENT = 'DoubanImgs (+http://www.yourdomain.com)'# Obey robots.txt rulesROBOTSTXT_OBEY = False# Configure maximum concurrent requests performed by Scrapy (default: 16)#CONCURRENT_REQUESTS = 32# Configure a delay for requests for the same website (default: 0)# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay# See also autothrottle settings and docsDOWNLOAD_DELAY = 0.5# The download delay setting will honor only one of:#CONCURRENT_REQUESTS_PER_DOMAIN = 16#CONCURRENT_REQUESTS_PER_IP = 16# Disable cookies (enabled by default)#COOKIES_ENABLED = False# Disable Telnet Console (enabled by default)#TELNETCONSOLE_ENABLED = False# Override the default request headers:#DEFAULT_REQUEST_HEADERS = {# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',# 'Accept-Language': 'en',#}# Enable or disable spider middlewares# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html#SPIDER_MIDDLEWARES = {# 'DoubanImgs.middlewares.DoubanimgsSpiderMiddleware': 543,#}# Enable or disable downloader middlewares# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#DOWNLOADER_MIDDLEWARES = {# 'DoubanImgs.middlewares.DoubanimgsDownloaderMiddleware': 543,#}# Enable or disable extensions# See https://doc.scrapy.org/en/latest/topics/extensions.html#EXTENSIONS = {# 'scrapy.extensions.telnet.TelnetConsole': None,#}# Configure item pipelines# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html#ITEM_PIPELINES = {# 'DoubanImgs.pipelines.DoubanimgsPipeline': 300,#}# Enable and configure the AutoThrottle extension (disabled by default)# See https://doc.scrapy.org/en/latest/topics/autothrottle.html#AUTOTHROTTLE_ENABLED = True# The initial download delay#AUTOTHROTTLE_START_DELAY = 5# The maximum download delay to be set in case of high latencies#AUTOTHROTTLE_MAX_DELAY = 60# The average number of requests Scrapy should be sending in parallel to# each remote server#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0# Enable showing throttling stats for every response received:#AUTOTHROTTLE_DEBUG = False# Enable and configure HTTP caching (disabled by default)# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings#HTTPCACHE_ENABLED = True#HTTPCACHE_EXPIRATION_SECS = 0#HTTPCACHE_DIR = 'httpcache'#HTTPCACHE_IGNORE_HTTP_CODES = []#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
vim items.py
# -*- coding: utf-8 -*-# Define here the models for your scraped items## See documentation in:# https://doc.scrapy.org/en/latest/topics/items.htmlimport scrapyfrom scrapy import Fieldclass DoubanImgsItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() image_urls = Field() images = Field() image_paths = Field()
vim pipelines.py
# -*- coding: utf-8 -*-# Define your item pipelines here## Don't forget to add your pipeline to the ITEM_PIPELINES setting# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.htmlfrom scrapy.pipelines.images import ImagesPipelinefrom scrapy.exceptions import DropItemfrom scrapy import Requestfrom scrapy import logclass DoubanImgsPipeline(object): def process_item(self, item, spider): return itemclass DoubanImgDownloadPipeline(ImagesPipeline): default_headers = { 'accept': 'image/webp,image/*,*/*;q=0.8', 'accept-encoding': 'gzip, deflate, sdch, br', 'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6', 'cookie': 'bid=yQdC/AzTaCw', 'referer': 'https://www.douban.com/photos/photo/2370443040/', 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36', } def get_media_requests(self, item, info): for image_url in item['image_urls']: self.default_headers['referer'] = image_url yield Request(image_url, headers=self.default_headers) def item_completed(self, results, item, info): image_paths = [x['path'] for ok, x in results if ok] if not image_paths: raise DropItem("Item contains no images") item['image_paths'] = image_paths return item