# IRProject/photo_scraper/spiders/shutterstock.py

import scrapy
import re
from scrapy_splash import SplashRequest
import dateparser

def clean(s):
    """Return *s* without leading/trailing tabs, CRs, LFs and spaces.

    ``None`` is mapped to the empty string. Only the four characters
    listed in the strip set are removed; other whitespace is left alone.
    """
    return "" if s is None else s.strip("\t\r\n ")

class ShutterstockSpider(scrapy.Spider):
    """Spider that starts at the Shutterstock featured-collections archive.

    ``parse`` follows grid-cell links to ``parse_photo`` and collection-card
    links back to ``parse``; ``parse_photo`` emits one dict per page that
    has a non-empty title.
    """

    name = "shutterstockscraper"

    start_urls = ["https://www.shutterstock.com/featured-collections/archive"]

    def parse(self, response):
        """Yield follow-up requests for every matching link on the page.

        Links matched by the mosaic-grid-cell selector are routed to
        ``parse_photo``; links inside grid-card containers are routed back
        to ``parse``. Requests are produced in that order.
        """
        # (CSS selector, callback) pairs, processed in order.
        routes = (
            ('a[data-automation=mosaic-grid-cell-anchor]::attr("href")', self.parse_photo),
            ('div[data-automation=GridCard_card_container] a::attr("href")', self.parse),
        )
        for selector, callback in routes:
            for href in response.css(selector).getall():
                yield response.follow(href, callback)

    def parse_photo(self, response):
        """Extract one item from the page and yield it if it has a title.

        The item is a plain dict with the keys ``author``, ``title``,
        ``description``, ``date``, ``img_url`` and ``tags``. ``date`` is
        always the empty string here. Nothing is yielded when the cleaned
        title is empty.
        """

        def first_text(selector):
            # First match for the selector, passed through clean()
            # (so a missing match becomes "").
            return clean(response.css(selector).get())

        photo = {
            "author": first_text('a[data-automation=AssetDetails_contributorLink]::text'),
            "title": first_text('h1::text'),
            "description": first_text('div[data-automation=ImageDetailsPage_ImageSize_Dropdown] p::text'),
            "date": "",
            # Not cleaned: stays None when no image element matches.
            "img_url": response.css('div[role=presentation] img::attr("src")').get(),
            "tags": response.css('div[data-automation=ExpandableKeywordsList] a::text').getall(),
        }

        # Pages without a title are skipped.
        if not photo["title"]:
            return
        yield photo