IRProject/photo_scraper/spiders/shutterstock.py

import scrapy
import re
from scrapy_splash import SplashRequest
import dateparser

def clean(s):
    """Return ``s`` without leading/trailing tabs, CRs, newlines and spaces.

    ``None`` is mapped to the empty string, so the result is always a ``str``.
    Only the four characters listed above are stripped; other whitespace
    (e.g. vertical tab) is left in place.
    """
    return "" if s is None else s.strip("\t\r\n ")

class ShutterstockSpider(scrapy.Spider):
    """Spider that starts at the Shutterstock featured-collections archive.

    ``parse`` fans out over listing pages; ``parse_photo`` turns a single
    photo page into a plain ``dict`` item.
    """

    name = "shutterstockscraper"

    start_urls = ["https://www.shutterstock.com/featured-collections/archive"]

    def parse(self, response):
        """Yield follow-up requests for every photo and card link on the page.

        Photo anchors are routed to ``parse_photo``; anchors inside grid
        cards are routed back to this method.
        """
        photo_hrefs = response.css(
            'a[data-automation=mosaic-grid-cell-anchor]::attr("href")'
        ).getall()
        yield from (
            response.follow(href, self.parse_photo) for href in photo_hrefs
        )

        card_hrefs = response.css(
            'div[data-automation=GridCard_card_container] a::attr("href")'
        ).getall()
        yield from (response.follow(href, self.parse) for href in card_hrefs)

    def parse_photo(self, response):
        """Yield one item dict for the photo page, or nothing if it has no title.

        The item carries ``author``, ``title``, ``description``, ``date``,
        ``img_url`` and ``tags``. ``date`` is always the empty string,
        ``img_url`` is the raw ``src`` attribute (``None`` when no image
        matches) and ``tags`` is a list of keyword strings.
        """

        def first_text(selector):
            # First match of the selector, whitespace-trimmed; "" when absent.
            return clean(response.css(selector).get())

        item = {
            "author": first_text(
                'a[data-automation=AssetDetails_contributorLink]::text'
            ),
            "title": first_text('h1::text'),
            "description": first_text(
                'div[data-automation=ImageDetailsPage_ImageSize_Dropdown] p::text'
            ),
            "date": "",
            "img_url": response.css(
                'div[role=presentation] img::attr("src")'
            ).get(),
            "tags": response.css(
                'div[data-automation=ExpandableKeywordsList] a::text'
            ).getall(),
        }

        # Pages without a title are skipped rather than emitted half-empty.
        if item["title"]:
            yield item
Added Shutterstock crawler 2020-12-01 21:32:45 +00:00
			`import scrapy`
			`import re`
			`from scrapy_splash import SplashRequest`
			`import dateparser`

			`def clean(s):`
			`if s is None:`
			`return ""`
			`else:`
			`return s.strip("\t\r\n ")`

			`class ShutterstockSpider(scrapy.Spider):`
			`name = "shutterstockscraper"`

			`start_urls = ["https://www.shutterstock.com/featured-collections/archive"]`

			`def parse(self, response):`
			`image_links = response.css('a[data-automation=mosaic-grid-cell-anchor]::attr("href")').getall()`
			`for link in image_links:`
			`yield response.follow(link, self.parse_photo)`
			`a = response.css('div[data-automation=GridCard_card_container] a::attr("href")').getall()`
			`for link in a:`
			`yield response.follow(link, self.parse)`

			`def parse_photo(self, response):`
			`video = {"author": clean(response.css('a[data-automation=AssetDetails_contributorLink]::text').get()),`
			`"title": clean(response.css('h1::text').get()),`
			`"description": clean(response.css('div[data-automation=ImageDetailsPage_ImageSize_Dropdown] p::text').get()),`
			`"date": "",`
			`"img_url": response.css('div[role=presentation] img::attr("src")').get(),`
			`"tags": response.css('div[data-automation=ExpandableKeywordsList] a::text').getall()}`
			`if len(video["title"]) > 0:`
			`yield video`