Added Shutterstock crawler
This commit is contained in:
parent
81dacae2ea
commit
fab188aba7
7 changed files with 5847 additions and 74421 deletions
33
photo_scraper/spiders/shutterstock.py
Normal file
33
photo_scraper/spiders/shutterstock.py
Normal file
|
@ -0,0 +1,33 @@
|
|||
import scrapy
|
||||
import re
|
||||
from scrapy_splash import SplashRequest
|
||||
import dateparser
|
||||
|
||||
def clean(s):
    """Normalize a scraped text fragment.

    Strips leading/trailing tabs, carriage returns, newlines, and spaces.
    Returns the empty string when *s* is None (e.g. a missing CSS match),
    so callers can treat every field uniformly as a string.
    """
    return "" if s is None else s.strip("\t\r\n ")
|
||||
|
||||
class ShutterstockSpider(scrapy.Spider):
    """Crawl the Shutterstock featured-collections archive and yield one
    dict per photo detail page.

    Crawl shape: the archive page links to collection cards (followed
    recursively with ``parse``) and to individual photo tiles (followed
    with ``parse_photo``).
    """

    name = "shutterstockscraper"

    start_urls = ["https://www.shutterstock.com/featured-collections/archive"]

    def parse(self, response):
        """Fan out from a collection/archive page.

        Yields follow-up requests for every photo tile and for every
        nested collection card found on the page.
        """
        # Photo tiles go to the detail-page parser.
        for photo_href in response.css('a[data-automation=mosaic-grid-cell-anchor]::attr("href")').getall():
            yield response.follow(photo_href, self.parse_photo)
        # Collection cards recurse back into this method.
        for collection_href in response.css('div[data-automation=GridCard_card_container] a::attr("href")').getall():
            yield response.follow(collection_href, self.parse)

    def parse_photo(self, response):
        """Extract one photo record from a Shutterstock detail page.

        Yields a dict with author/title/description/date/img_url/tags;
        pages whose title cannot be extracted are skipped entirely.
        """
        select = response.css
        item = {
            "author": clean(select('a[data-automation=AssetDetails_contributorLink]::text').get()),
            "title": clean(select('h1::text').get()),
            "description": clean(select('div[data-automation=ImageDetailsPage_ImageSize_Dropdown] p::text').get()),
            # No publication date is exposed on the page; kept as an
            # empty placeholder so the CSV schema stays consistent.
            "date": "",
            "img_url": select('div[role=presentation] img::attr("src")').get(),
            "tags": select('div[data-automation=ExpandableKeywordsList] a::text').getall(),
        }
        # An empty title means extraction failed — drop the record.
        if item["title"]:
            yield item
|
|
@ -3,3 +3,5 @@
|
|||
source ./venv/bin/activate.sh
|
||||
|
||||
scrapy runspider photo_scraper/spiders/flickr.py -o scraped/flickr.csv
|
||||
scrapy runspider photo_scraper/spiders/stock123rf.py -o scraped/123rf.csv
|
||||
scrapy runspider photo_scraper/spiders/shutterstock.py -o scraped/shutterstock.csv
|
||||
|
|
|
@ -1,6 +0,0 @@
|
|||
#!/bin/sh
|
||||
|
||||
rm tosolr.csv
|
||||
touch tosolr.csv
|
||||
pv 123rf.csv | awk '{print NR-1 "," $0}' | tail -n +2 | awk 'BEGIN {print "id,t_author,t_title,t_description,date,img_url,tags"} {print}' >> tosolr.csv
|
||||
pv photos.csv | awk '{print NR-1 "," $0}' | tail -n +2 | awk 'BEGIN {print "id,t_author,t_title,t_description,date,img_url,tags"} {print}' >> tosolr.csv
|
5810
scraped/shutterstock.csv
Normal file
5810
scraped/shutterstock.csv
Normal file
File diff suppressed because it is too large
Load diff
74413
scraped/tosolr.csv
74413
scraped/tosolr.csv
File diff suppressed because one or more lines are too long
|
@ -29,7 +29,7 @@ solr/bin/solr start
|
|||
cd scraped
|
||||
|
||||
# POST scraped data
|
||||
cat photos.csv 123rf.csv | \
|
||||
cat photos.csv 123rf.csv shutterstock.csv | \
|
||||
awk '{print NR-1 "," $0}' | \
|
||||
tail -n +2 | \
|
||||
awk 'BEGIN {print "id,t_author,t_title,t_description,date,img_url,tags"} {print}' | \
|
||||
|
|
|
@ -36,7 +36,7 @@
|
|||
<a class="img_box" href="{{ url }}" target="_blank"
|
||||
style="background-image: url({{ url }})"/></a>
|
||||
<div class="description">
|
||||
<h2><a href="{{ site_url }}">{{ title }}</a></h2>
|
||||
<h2>{{ title }}</h2>
|
||||
<h3>By {{ author }}</h3>
|
||||
{{ description }}</div>
|
||||
</div>
|
||||
|
|
Reference in a new issue