33 lines
1.3 KiB
Python
33 lines
1.3 KiB
Python
import scrapy
|
|
import re
|
|
from scrapy_splash import SplashRequest
|
|
import dateparser
|
|
|
|
def clean(s):
    """Return ``s`` trimmed of surrounding tabs, CRs, newlines and spaces.

    ``None`` is mapped to the empty string so callers can pass the result of
    a selector lookup directly without checking whether anything matched.
    Only the four characters listed in the strip set are removed; any other
    leading/trailing character is left in place.
    """
    return "" if s is None else s.strip("\t\r\n ")
|
|
|
|
class ShutterstockSpider(scrapy.Spider):
    """Spider that walks Shutterstock listing pages and yields photo records.

    Crawling starts from the featured-collections archive page. Listing
    pages are handled by ``parse``; individual asset pages are handled by
    ``parse_photo``, which yields one plain ``dict`` per page.
    """

    name = "shutterstockscraper"

    start_urls = ["https://www.shutterstock.com/featured-collections/archive"]

    def parse(self, response):
        """Schedule requests for every photo and listing link on the page.

        Anchors marked ``mosaic-grid-cell-anchor`` are followed with
        ``parse_photo``; anchors inside ``GridCard_card_container`` elements
        are followed with ``parse`` again, so nested listings are crawled
        recursively.
        """
        photo_links = response.css(
            'a[data-automation=mosaic-grid-cell-anchor]::attr("href")'
        ).getall()
        for link in photo_links:
            yield response.follow(link, self.parse_photo)

        # presumably these cards link to further collection/listing pages
        # -- confirm against the live markup.
        listing_links = response.css(
            'div[data-automation=GridCard_card_container] a::attr("href")'
        ).getall()
        for link in listing_links:
            yield response.follow(link, self.parse)

    def parse_photo(self, response):
        """Extract one photo record from an asset page.

        Yields a dict with the keys ``author``, ``title``, ``description``,
        ``date``, ``img_url`` and ``tags``. Nothing is yielded when the
        cleaned title is empty (no ``<h1>`` text was found).
        """
        # Normalise keywords the same way as the other text fields and drop
        # entries that are empty after trimming.
        raw_tags = response.css(
            'div[data-automation=ExpandableKeywordsList] a::text'
        ).getall()
        tags = [tag for tag in map(clean, raw_tags) if tag]

        photo = {
            "author": clean(
                response.css(
                    'a[data-automation=AssetDetails_contributorLink]::text'
                ).get()
            ),
            "title": clean(response.css('h1::text').get()),
            # NOTE(review): the description is read from the image-size
            # dropdown container -- confirm this is the intended element.
            "description": clean(
                response.css(
                    'div[data-automation=ImageDetailsPage_ImageSize_Dropdown] p::text'
                ).get()
            ),
            # No date is extracted; the field is always an empty string.
            "date": "",
            # Left as returned by the selector; may be None if no image matched.
            "img_url": response.css('div[role=presentation] img::attr("src")').get(),
            "tags": tags,
        }

        if photo["title"]:
            yield photo
|