import scrapy
import re
from scrapy_splash import SplashRequest
import dateparser


def clean(s):
    """Return *s* without surrounding tabs, CR, LF or spaces.

    ``None`` is mapped to the empty string so callers can pass the result of
    a selector ``.get()`` straight in without checking for a missing match.
    """
    if s is None:
        return ""
    return s.strip("\t\r\n ")


class ShutterstockSpider(scrapy.Spider):
    """Spider that starts at the Shutterstock featured-collections archive.

    ``parse`` follows two kinds of links found on a listing page: mosaic grid
    anchors (handled by ``parse_photo``) and grid-card links (handled by
    ``parse`` again). ``parse_photo`` turns a detail page into a plain dict.
    """

    name = "shutterstockscraper"
    start_urls = ["https://www.shutterstock.com/featured-collections/archive"]

    def parse(self, response):
        """Yield follow-up requests for every relevant link on *response*.

        Mosaic grid anchors are scheduled first, with ``parse_photo`` as the
        callback; grid-card links are scheduled afterwards and come back to
        this method.
        """
        photo_hrefs = response.css(
            'a[data-automation=mosaic-grid-cell-anchor]::attr("href")'
        ).getall()
        for href in photo_hrefs:
            yield response.follow(href, self.parse_photo)

        card_hrefs = response.css(
            'div[data-automation=GridCard_card_container] a::attr("href")'
        ).getall()
        for href in card_hrefs:
            yield response.follow(href, self.parse)

    def parse_photo(self, response):
        """Yield one dict describing the asset shown on *response*.

        The dict has the keys ``author``, ``title``, ``description``,
        ``date``, ``img_url`` and ``tags``. Text fields go through ``clean``;
        ``img_url`` is the raw selector result (``None`` when nothing
        matches) and ``tags`` is a list. Nothing is yielded when the cleaned
        title is empty.
        """
        author = clean(
            response.css(
                'a[data-automation=AssetDetails_contributorLink]::text'
            ).get()
        )
        title = clean(response.css('h1::text').get())
        description = clean(
            response.css(
                'div[data-automation=ImageDetailsPage_ImageSize_Dropdown] p::text'
            ).get()
        )
        img_url = response.css('div[role=presentation] img::attr("src")').get()
        tags = response.css(
            'div[data-automation=ExpandableKeywordsList] a::text'
        ).getall()

        item = {
            "author": author,
            "title": title,
            "description": description,
            # No date is extracted from the page; the field is always empty.
            "date": "",
            "img_url": img_url,
            "tags": tags,
        }

        # Pages without an <h1> text are skipped entirely.
        if item["title"]:
            yield item