Added Shutterstock crawler

This commit is contained in:
Claudio Maggioni (maggicl) 2020-12-01 22:32:45 +01:00
parent 81dacae2ea
commit fab188aba7
7 changed files with 5847 additions and 74421 deletions

View file

@ -0,0 +1,33 @@
import scrapy
import re
from scrapy_splash import SplashRequest
import dateparser
def clean(s):
    """Return *s* with surrounding tabs, CR/LF and spaces removed.

    Selector lookups return ``None`` when nothing matched; that case is
    mapped to the empty string so callers always receive a ``str``.

    Only the four characters ``"\\t\\r\\n "`` are stripped; other whitespace
    (for example vertical tab or non-breaking space) is left untouched.
    """
    if s is None:
        return ""
    return s.strip("\t\r\n ")
class ShutterstockSpider(scrapy.Spider):
    """Crawl Shutterstock featured collections and yield one item per photo.

    Starting from the featured-collections archive page, the spider follows
    every collection card back into ``parse`` and every photo thumbnail into
    ``parse_photo``, which emits a flat dict describing the photo.
    """

    # Spider name as registered with Scrapy.
    name = "shutterstockscraper"
    # Entry point of the crawl: the archive of featured collections.
    start_urls = ["https://www.shutterstock.com/featured-collections/archive"]

    def parse(self, response):
        """Schedule requests for the photos and collections linked on a page.

        Yields:
            Requests for each photo detail page (handled by ``parse_photo``)
            followed by requests for each collection card (handled by this
            same method).
        """
        photo_links = response.css(
            'a[data-automation=mosaic-grid-cell-anchor]::attr("href")'
        ).getall()
        for link in photo_links:
            yield response.follow(link, self.parse_photo)

        collection_links = response.css(
            'div[data-automation=GridCard_card_container] a::attr("href")'
        ).getall()
        for link in collection_links:
            yield response.follow(link, self.parse)

    def parse_photo(self, response):
        """Extract a photo item from a photo detail page.

        Yields:
            A dict with the keys ``author``, ``title``, ``description``,
            ``date``, ``img_url`` and ``tags``, or nothing when the page has
            no non-blank ``<h1>`` title.
        """
        # NOTE(review): key order presumably has to mirror the column order
        # of the CSV header used when importing into Solr
        # (author, title, description, date, img_url, tags) - confirm before
        # reordering.
        photo = {
            "author": clean(
                response.css(
                    'a[data-automation=AssetDetails_contributorLink]::text'
                ).get()
            ),
            "title": clean(response.css('h1::text').get()),
            "description": clean(
                response.css(
                    'div[data-automation=ImageDetailsPage_ImageSize_Dropdown] p::text'
                ).get()
            ),
            # No date is extracted from the page; the field is emitted empty.
            "date": "",
            "img_url": response.css(
                'div[role=presentation] img::attr("src")'
            ).get(),
            "tags": response.css(
                'div[data-automation=ExpandableKeywordsList] a::text'
            ).getall(),
        }
        # Skip pages without a usable title.
        if photo["title"]:
            yield photo

View file

@ -3,3 +3,5 @@
source ./venv/bin/activate.sh source ./venv/bin/activate.sh
scrapy runspider photo_scraper/spiders/flickr.py -o scraped/flickr.csv scrapy runspider photo_scraper/spiders/flickr.py -o scraped/flickr.csv
scrapy runspider photo_scraper/spiders/stock123rf.py -o scraped/123rf.csv
scrapy runspider photo_scraper/spiders/shutterstock.py -o scraped/shutterstock.csv

View file

@ -1,6 +0,0 @@
#!/bin/sh
# Build tosolr.csv from the scraped CSV files for import into Solr.
#
# Every input file's own header row is dropped, the remaining rows are
# numbered consecutively across ALL inputs (ids start at 1), and a single
# header row is written at the top of the output.
#
# Numbering happens in one awk pass over the combined stream: running a
# separate pipeline per file would restart the ids at 1 for each file and
# repeat the header row in the middle of the output.
#
# The redirection truncates or creates tosolr.csv, so no prior rm/touch is
# needed (and a missing file is not an error).
for csv in 123rf.csv photos.csv; do
    # pv only adds a progress display; tail strips the file's header row.
    pv "$csv" | tail -n +2
done | awk 'BEGIN {print "id,t_author,t_title,t_description,date,img_url,tags"} {print NR "," $0}' > tosolr.csv

5810
scraped/shutterstock.csv Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because one or more lines are too long

View file

@ -29,7 +29,7 @@ solr/bin/solr start
cd scraped cd scraped
# POST scraped data # POST scraped data
cat photos.csv 123rf.csv | \ cat photos.csv 123rf.csv shutterstock.csv | \
awk '{print NR-1 "," $0}' | \ awk '{print NR-1 "," $0}' | \
tail -n +2 | \ tail -n +2 | \
awk 'BEGIN {print "id,t_author,t_title,t_description,date,img_url,tags"} {print}' | \ awk 'BEGIN {print "id,t_author,t_title,t_description,date,img_url,tags"} {print}' | \

View file

@ -36,7 +36,7 @@
<a class="img_box" href="{{ url }}" target="_blank" <a class="img_box" href="{{ url }}" target="_blank"
style="background-image: url({{ url }})"/></a> style="background-image: url({{ url }})"/></a>
<div class="description"> <div class="description">
<h2><a href="{{ site_url }}">{{ title }}</a></h2> <h2>{{ title }}</h2>
<h3>By {{ author }}</h3> <h3>By {{ author }}</h3>
{{ description }}</div> {{ description }}</div>
</div> </div>