This is my program to download images through image pipeline. It works well and download images but the problem ** is it rename images in sha1 hash after that I am unable to identify them. Can there be any solution so that I can use the **model_name as of the images to be download?
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from selenium import webdriver
from urlparse import urljoin
import time
class CompItem(scrapy.Item):
model_name = scrapy.Field()
images = scrapy.Field()
image_urls = scrapy.Field()
image_name = scrapy.Field()
class criticspider(CrawlSpider):
name = "buysmaart_images"
allowed_domains = ["http://buysmaart.com/"]
start_urls = ["http://buysmaart.com/productdetails/550/Samsung-Galaxy-Note-4", "http://buysmaart.com/productdetails/115/HTC-One-M8-Eye", "http://buysmaart.com/productdetails/506/OPPO-N1", "http://buysmaart.com/productdetails/342/LG-G2-D802T"]
def __init__(self, *args, **kwargs):
super(criticspider, self).__init__(*args, **kwargs)
self.download_delay = 0.25
self.browser = webdriver.Firefox()
self.browser.implicitly_wait(2)
def parse_start_url(self, response):
self.browser.get(response.url)
time.sleep(8)
sel = Selector(text=self.browser.page_source)
item = CompItem()
photos = sel.xpath('//ul[contains(@id,"productImageUl")]/li')
print len(photos)
all_photo_urls = []
for photo in photos:
item['image_name'] = sel.xpath('.//h3[contains(@class,"ng-binding")]/text()').extract()[0].encode('ascii','ignore')
#tmp_url = photo.xpath('.//img/@src').extract()[0].encode('ascii','ignore')
image_url = photo.xpath('.//img/@src').extract()[0]
all_photo_urls.append(image_url)
item['image_urls'] = all_photo_urls
yield item
pipeline
from scrapy.contrib.pipeline.images import DownloadImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
class DownloadImagesPipeline(object):
def process_item(self, item, spider):
def get_media_requests(self, item, info):
return [Request(x, meta={'image_names': item["image_name"]})
for x in item.get('image_urls', [])]
def get_images(self, response, request, info):
for key, image, buf, in super(DownloadImagesPipeline, self).get_images(response, request, info):
if re.compile('^[0-9,a-f]+.jpg$').match(key):
key = self.change_filename(key, response)
yield key, image, buf
def change_filename(self, key, response):
return "%s.jpg" % response.meta['image_name'][0]
def item_completed(self, results, item, info):
image_paths = [x['path'] for ok, x in results if ok]
if not image_paths:
raise DropItem("Item contains no images")
item['image_paths'] = image_paths
return item
settings
BOT_NAME = 'download_images'
SPIDER_MODULES = ['download_images.spiders']
NEWSPIDER_MODULE = 'download_images.spiders'
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
IMAGES_STORE= '/home/john/Desktop/download_images/31_jul'