This repository has been archived on 2020-12-10. You can view files and clone it, but cannot push or open issues or pull requests.
IRProject/photo_scraper/spiders/shutterstock.py
Claudio Maggioni (maggicl) fab188aba7 Added Shutterstock crawler
2020-12-01 22:32:45 +01:00

33 lines
1.3 KiB
Python

import scrapy
import re
from scrapy_splash import SplashRequest
import dateparser
def clean(s):
    """Return *s* with surrounding tabs, CRs, newlines and spaces removed.

    ``None`` (e.g. a selector that matched nothing) is mapped to ``""`` so
    callers always get a string back.
    """
    return "" if s is None else s.strip("\t\r\n ")
class ShutterstockSpider(scrapy.Spider):
    """Spider that walks Shutterstock's featured-collections archive.

    ``parse`` handles listing pages: it schedules every photo link for
    ``parse_photo`` and every grid-card link for another ``parse`` pass.
    ``parse_photo`` turns a photo detail page into a plain ``dict`` item.
    """

    name = "shutterstockscraper"
    start_urls = ["https://www.shutterstock.com/featured-collections/archive"]

    def parse(self, response):
        """Follow photo links and nested grid-card links found on a listing page.

        Yields:
            Requests for photo pages (callback ``parse_photo``) followed by
            requests for further listing pages (callback ``parse``).
        """
        photo_links = response.css(
            'a[data-automation=mosaic-grid-cell-anchor]::attr("href")'
        ).getall()
        for href in photo_links:
            yield response.follow(href, self.parse_photo)

        card_links = response.css(
            'div[data-automation=GridCard_card_container] a::attr("href")'
        ).getall()
        for href in card_links:
            yield response.follow(href, self.parse)

    def parse_photo(self, response):
        """Extract one photo item from a detail page.

        Yields:
            A dict with the keys ``author``, ``title``, ``description``,
            ``date``, ``img_url`` and ``tags``. Nothing is yielded when the
            page has no (non-blank) ``h1`` title.
        """
        img_src = response.css('div[role=presentation] img::attr("src")').get()

        photo = {
            "author": clean(
                response.css(
                    'a[data-automation=AssetDetails_contributorLink]::text'
                ).get()
            ),
            "title": clean(response.css('h1::text').get()),
            "description": clean(
                response.css(
                    'div[data-automation=ImageDetailsPage_ImageSize_Dropdown] p::text'
                ).get()
            ),
            # No date is extracted from the page; the key is kept (empty) so
            # every item has the same set of fields.
            "date": "",
            # Resolve relative / protocol-relative ``src`` values against the
            # page URL so the stored link is usable on its own. A missing or
            # empty ``src`` is passed through unchanged.
            "img_url": response.urljoin(img_src) if img_src else img_src,
            "tags": response.css(
                'div[data-automation=ExpandableKeywordsList] a::text'
            ).getall(),
        }

        # ``clean`` always returns a string, so an empty title is simply falsy.
        if photo["title"]:
            yield photo