1

This is my program to download images through the image pipeline. It works well and downloads the images, but the problem is that it renames each image to its SHA1 hash, after which I am unable to identify them. Is there a solution that lets me use the **model_name** of each item as the name of the downloaded images?

   import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from selenium import webdriver
from urlparse import urljoin
import time

class CompItem(scrapy.Item):
    """Item carrying the scraped model name together with the fields the
    images pipeline reads (image_urls) and fills in (images)."""
    image_urls = scrapy.Field()
    images = scrapy.Field()
    model_name = scrapy.Field()
    image_name = scrapy.Field()

class criticspider(CrawlSpider):
    """Render product pages with Selenium and yield one item per page
    containing the model/image name and all product-image URLs.

    Fixes over the original:
      * allowed_domains must hold bare domain names, not URLs — a scheme or
        trailing slash makes OffsiteMiddleware filter every request.
      * image_name is a page-level value, so it is extracted once instead of
        once per photo, and guarded against an empty extract() result.
      * image_urls is assigned after the loop instead of on every iteration.
      * the Firefox process is shut down when the spider closes.
    """
    name = "buysmaart_images"
    allowed_domains = ["buysmaart.com"]
    start_urls = ["http://buysmaart.com/productdetails/550/Samsung-Galaxy-Note-4",  "http://buysmaart.com/productdetails/115/HTC-One-M8-Eye",  "http://buysmaart.com/productdetails/506/OPPO-N1",  "http://buysmaart.com/productdetails/342/LG-G2-D802T"]

    def __init__(self, *args, **kwargs):
        super(criticspider, self).__init__(*args, **kwargs)
        self.download_delay = 0.25
        self.browser = webdriver.Firefox()
        self.browser.implicitly_wait(2)

    def closed(self, reason):
        # Called by Scrapy when the spider finishes; without this the
        # Firefox process started in __init__ is leaked.
        self.browser.quit()

    def parse_start_url(self, response):
        self.browser.get(response.url)
        time.sleep(8)  # crude wait for the Angular gallery to render
        sel = Selector(text=self.browser.page_source)
        item = CompItem()

        # The model name is the same for every photo on the page.
        names = sel.xpath('.//h3[contains(@class,"ng-binding")]/text()').extract()
        if names:
            item['image_name'] = names[0].encode('ascii', 'ignore')

        photos = sel.xpath('//ul[contains(@id,"productImageUl")]/li')
        all_photo_urls = []
        for photo in photos:
            srcs = photo.xpath('.//img/@src').extract()
            if srcs:  # skip <li> elements without an <img>
                all_photo_urls.append(srcs[0])
        item['image_urls'] = all_photo_urls
        yield item

pipeline

    from scrapy.contrib.pipeline.images import DownloadImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
class DownloadImagesPipeline(object):
    def process_item(self, item, spider):
         def get_media_requests(self, item, info):
        return [Request(x, meta={'image_names': item["image_name"]})
                for x in item.get('image_urls', [])]

def get_images(self, response, request, info):
    for key, image, buf, in super(DownloadImagesPipeline, self).get_images(response, request, info):
        if re.compile('^[0-9,a-f]+.jpg$').match(key):
            key = self.change_filename(key, response)
        yield key, image, buf

def change_filename(self, key, response):
    return "%s.jpg" % response.meta['image_name'][0]

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item

settings

BOT_NAME = 'download_images'

SPIDER_MODULES = ['download_images.spiders']
NEWSPIDER_MODULE = 'download_images.spiders'

# Register the project's OWN pipeline class.  The original setting listed
# 'scrapy.contrib.pipeline.images.ImagesPipeline', i.e. the stock pipeline,
# so the custom renaming code in download_images/pipelines.py never ran and
# files kept their sha1-hash names.
ITEM_PIPELINES = {'download_images.pipelines.DownloadImagesPipeline': 1}
# Directory under which the images pipeline stores downloaded files.
IMAGES_STORE = '/home/john/Desktop/download_images/31_jul'
John Dene
  • 550
  • 1
  • 7
  • 21
  • Possible duplicate of [Scrapy: customize Image pipeline with renaming defualt image name](https://stackoverflow.com/questions/18081997/scrapy-customize-image-pipeline-with-renaming-defualt-image-name) – Gallaecio May 27 '19 at 06:58

3 Answers3

3

Scrapy 1.3.3 solution (override the `image_downloaded` method):

import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.misc import md5sum
class MyImagesPipeline(ImagesPipeline):
    """Store each downloaded image as full/<image_names[0]> rather than
    under the default sha1-hash file name (Scrapy 1.3.3)."""

    def get_media_requests(self, item, info):
        # Forward the desired file name to every download request.
        for url in item['image_urls']:
            yield scrapy.Request(url, meta={'image_names': item["image_names"]})

    def image_downloaded(self, response, request, info):
        # Mirrors ImagesPipeline.image_downloaded, except the storage path
        # comes from request meta instead of the URL hash.
        digest = None
        for _, image, buf in self.get_images(response, request, info):
            if digest is None:
                buf.seek(0)
                digest = md5sum(buf)
            width, height = image.size
            name = 'full/%s' % response.meta['image_names'][0]
            self.store.persist_file(
                name, buf, info,
                meta={'width': width, 'height': height},
                headers={'Content-Type': 'image/jpeg'})
        return digest
Devin
  • 139
  • 1
  • 7
1

The solution is to override the image_key method of your DownloadImagesPipeline class.

def image_key(self, url):
    """Return the storage name for an image, derived from its URL.

    Returning a constant here (as the original placeholder did) would make
    every downloaded image overwrite the same file, so use the URL's last
    path segment instead — exactly the naming the surrounding answer
    recommends.  Note: image_key is deprecated in newer Scrapy releases.
    """
    return url.split('/')[-1]

For example if you want the image name of the URL you can use

url.split('/')[-1]

as the name of the image. Note that this method is deprecated and may be removed in a future release.

Alternatively you can set the image_name for your image in your Spider:

item['image_name'] = ['whatever_you_want']

In this case you have to extend your pipeline a bit more to utilize the name of the image you provided:

def get_media_requests(self, item, info):
    """Build one download Request per image URL, carrying the scraped
    name along in request.meta for the renaming step."""
    requests = []
    for url in item.get('image_urls', []):
        requests.append(Request(url, meta={'image_names': item["image_name"]}))
    return requests

def get_images(self, response, request, info):
    """Yield (key, image, buf) triples, renaming hash-named files.

    Fixes over the original: `re` was never imported in the snippet; the
    pattern '^[0-9,a-f]+.jpg$' wrongly allowed commas in the hex digest
    and any character before "jpg" (unescaped dot); and the regex was
    recompiled on every loop iteration.
    """
    import re  # local import: the snippet has no module-level import block
    # Default ImagesPipeline file name: "<sha1 hex digest>.jpg".
    hash_name = re.compile(r'^[0-9a-f]+\.jpg$')
    for key, image, buf in super(DownloadImagesPipeline, self).get_images(response, request, info):
        if hash_name.match(key):
            key = self.change_filename(key, response)
        yield key, image, buf

def change_filename(self, key, response):
    return "%s.jpg" % response.meta['image_name'][0]

And of course your pipeline should extend ImagesPipeline.

GHajba
  • 3,665
  • 5
  • 25
  • 35
  • I have update my spider and pipeline still not giving me names – John Dene Aug 03 '15 at 06:53
  • Did you include your image pipeline in your settings too? – GHajba Aug 03 '15 at 07:15
  • `ITEM_PIPELINES = ['download_images.pipelines.DownloadImagesPipeline']` or the path to your `DownloadImagesPipeline` class wherever it is. – GHajba Aug 03 '15 at 07:17
  • NameError: Module 'scrapy.contrib.pipeline.images' doesn't define any object named 'DownloadImagesPipeline' getting this error – John Dene Aug 03 '15 at 07:38
  • How did you update the settings? You know, you have to add the path to your class -- which is not in `scrapy.contrib` but in your project. – GHajba Aug 03 '15 at 07:49
  • I have updated the pipeline can you help me what I am doing wrong? – John Dene Aug 03 '15 at 08:00
  • Let us [continue this discussion in chat](http://chat.stackoverflow.com/rooms/84983/discussion-between-ghajba-and-john-dene). – GHajba Aug 03 '15 at 08:15
0

This answer shows how to give the images custom names, as well as how to choose the (custom-named) folder into which they are saved.

#spider.py

import scrapy
from ..items import DusharaItem
class DusharaSpider(scrapy.Spider):
    """Scrape a photo gallery, yielding one item per image with a custom
    file name and the folder names the pipeline should store it under."""
    name = 'dushara'
    start_urls = ['https://www.indiaglitz.com/dushara-photos-tamil-actress-3129970-8771']

    def parse(self, response):
        blocks = response.xpath('//div[@class="gallmain gallerycontainer-8771"]/div[@class="gallery_detail gal-8771"]')
        for block in blocks:
            src = block.xpath('./img/@src').extract_first()
            item = DusharaItem()
            # Single URL string (not a list): the custom pipeline below
            # requests it directly.
            item['image_urls'] = src
            item['folder_names_1'] = 'Actress'
            item['folder_names_2'] = 'Tamil'
            # Last URL segment keeps the image extension (e.g. .jpg).
            item['image_names'] = src.split('/')[-1]
            yield item

#items.py

import scrapy
class DusharaItem(scrapy.Item):
    """Item with the image URL plus the folder and file names the custom
    pipeline uses to build the storage path."""
    image_urls = scrapy.Field()
    images = scrapy.Field()
    image_names = scrapy.Field()
    folder_names_1 = scrapy.Field()
    folder_names_2 = scrapy.Field()


#pipelines.py

import scrapy
from scrapy.pipelines.images import ImagesPipeline
class DusharaPipeline(ImagesPipeline):
    """ImagesPipeline variant storing every image under
    /<folder_names_1>/<folder_names_2>/<image_names>."""

    def get_media_requests(self, item, info):
        # Ship the folder/file names along in request.meta so file_path()
        # can build the storage path without re-reading the item.
        meta = {
            'folder_names_1': item['folder_names_1'],
            'folder_names_2': item['folder_names_2'],
            'image_names': item['image_names'],
        }
        yield scrapy.Request(url=item['image_urls'], meta=meta)

    def file_path(self, request, response=None, info=None, *, item=None):
        # Leading '' yields the same leading slash as the original
        # string concatenation.
        parts = ('',
                 request.meta['folder_names_1'],
                 request.meta['folder_names_2'],
                 request.meta['image_names'])
        return '/'.join(parts)

#settings.py

# Register the custom pipeline (priority 300) so its file_path() is used
# instead of the default sha1-hash naming.
ITEM_PIPELINES  = {'dushara.pipelines.DusharaPipeline': 300}
#ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1} # for default scraping process
# Root directory under which the images pipeline stores downloaded files.
IMAGES_STORE    = r'D:\Scraped'