Hi Stack Overflow community,
I have the following problem:
I am trying to crawl a long list of websites. Some of the websites in the start_urls list redirect (301). I want Scrapy to follow those redirects and crawl the redirected sites as if they were also in allowed_domains (which they are not). For example, example.com is in my start_urls and allowed_domains lists, and example.com redirects to foo.com; I want foo.com to be crawled as well.
DEBUG: Redirecting (301) to <GET http://www.foo.com/> from <GET http://www.example.com>
I came across this answer, Scrapy Crawl all websites in start_url even if redirect, which solves this by modifying the OffsiteMiddleware. That part I understand (my adaptation of it is shown after the spider code below), but I am unsure how parse_start_url should be overridden. This is my code so far:
import scrapy
import urllib.request
import urllib.parse
import json
from placementarchitect import bingapi
import tldextract
from spiderproject.items import DmozItem
from scrapy.crawler import CrawlerProcess
class GoodSpider(scrapy.Spider):
    name = "goodoldSpider"

    def __init__(self, input=None):
        self.searchterm = input
        # Returns a list of crawlable sites from the Bing Search API
        self.urlstocrawl = bingapi.get_crawl_urls(self.searchterm)
        self.start_urls = self.urlstocrawl
        self.allowed_domains = []

    def parse_start_url(self, response):
        # Intended to add the (possibly redirected) domain of each start URL
        # to allowed_domains before normal parsing continues.
        domain = tldextract.extract(str(response.request.url)).registered_domain
        if domain not in self.allowed_domains:
            self.allowed_domains.append(domain)
        else:
            return self.parse(response)
    def parse(self, response):
        # Follow every link found on the page
        for href in response.xpath("//a/@href"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        for sel in response.xpath('//div[attribute::class="cat-item"]'):
            item = DmozItem()
            item['title'] = sel.xpath('a/div/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()
            item['desc'] = sel.xpath('text()').extract()
            yield item

        next_page = response.css(".cat-item>a::attr('href')")
        if next_page:
            url = response.urljoin(next_page[0].extract())
            yield scrapy.Request(url, self.parse_dir_contents)
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process.crawl(GoodSpider, input='"good news"')
process.start() # the script will block here until the crawling is finished
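For reference, this is roughly how I adapted the OffsiteMiddleware part of the linked answer. The module path spiderproject.middlewares.MyOffsiteMiddleware and the idea of rebuilding the host regex on every check are my own guesses, so this part may well be off:

    # settings.py
    SPIDER_MIDDLEWARES = {
        'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': None,
        'spiderproject.middlewares.MyOffsiteMiddleware': 500,
    }

    # spiderproject/middlewares.py
    from scrapy.spidermiddlewares.offsite import OffsiteMiddleware

    class MyOffsiteMiddleware(OffsiteMiddleware):
        def should_follow(self, request, spider):
            # allowed_domains may have grown since spider_opened (e.g. after a
            # redirect was recorded in parse_start_url), so rebuild the regex
            # before deciding whether the request is on-site.
            self.host_regex = self.get_host_regex(spider)
            return super().should_follow(request, spider)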
The Scrapy documentation is sparse on parse_start_url, so I am not sure how it is supposed to be implemented, and my solution does not seem to work. I am afraid the problem is in how I have implemented parse_start_url; below is how I currently understand it is meant to be used.
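From reading the CrawlSpider source, my understanding is that parse_start_url is a CrawlSpider hook that receives the response for each start URL, something like the sketch below (the class name and the parse_item callback are just placeholders, and I may be misreading whether this also applies to a plain scrapy.Spider):

    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor
    import tldextract

    class RedirectAwareSpider(CrawlSpider):
        name = "redirectaware"
        start_urls = ["http://www.example.com"]
        allowed_domains = []
        rules = (Rule(LinkExtractor(), callback="parse_item", follow=True),)

        def parse_start_url(self, response):
            # Called by CrawlSpider for each response to a start URL, so the
            # (possibly redirected) domain can be registered here.
            domain = tldextract.extract(response.url).registered_domain
            if domain not in self.allowed_domains:
                self.allowed_domains.append(domain)
            # Anything returned here is treated as normal spider output
            # (items or further Requests); returning nothing is also fine.
            return []

        def parse_item(self, response):
            pass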
Any advice would be highly appreciated.
Mike