Added Shutterstock crawler
This commit is contained in:
parent
81dacae2ea
commit
fab188aba7
7 changed files with 5847 additions and 74421 deletions
33
photo_scraper/spiders/shutterstock.py
Normal file
33
photo_scraper/spiders/shutterstock.py
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
import scrapy
|
||||||
|
import re
|
||||||
|
from scrapy_splash import SplashRequest
|
||||||
|
import dateparser
|
||||||
|
|
||||||
|
def clean(s):
    """Normalize a scraped text fragment.

    Strips surrounding tabs, carriage returns, newlines and spaces.
    Returns the empty string when given None, so callers can treat the
    result as a plain str without a null check.
    """
    return "" if s is None else s.strip("\t\r\n ")
|
||||||
|
|
||||||
|
class ShutterstockSpider(scrapy.Spider):
    """Crawl Shutterstock's featured-collections archive and yield photo metadata.

    Starting from the archive index, parse() follows each photo tile to
    parse_photo() and recurses into further collection cards. Each yielded
    item is a dict with the keys: author, title, description, date, img_url,
    tags — matching the CSV columns used by the sibling spiders.
    """

    name = "shutterstockscraper"

    start_urls = ["https://www.shutterstock.com/featured-collections/archive"]

    def parse(self, response):
        """Follow every photo link on a collection page, then recurse into
        the other collection cards found on the same page."""
        image_links = response.css('a[data-automation=mosaic-grid-cell-anchor]::attr("href")').getall()
        for link in image_links:
            yield response.follow(link, self.parse_photo)
        # Collection cards link to further archive pages; crawl them with parse().
        collection_links = response.css('div[data-automation=GridCard_card_container] a::attr("href")').getall()
        for link in collection_links:
            yield response.follow(link, self.parse)

    def parse_photo(self, response):
        """Extract one photo's metadata from its detail page.

        Items with an empty title are dropped — they indicate the page
        layout did not match the selectors.
        """
        photo = {
            "author": clean(response.css('a[data-automation=AssetDetails_contributorLink]::text').get()),
            "title": clean(response.css('h1::text').get()),
            "description": clean(response.css('div[data-automation=ImageDetailsPage_ImageSize_Dropdown] p::text').get()),
            # The page does not expose a capture/upload date; keep the
            # column present so the merged CSV schema stays aligned.
            "date": "",
            "img_url": response.css('div[role=presentation] img::attr("src")').get(),
            "tags": response.css('div[data-automation=ExpandableKeywordsList] a::text').getall(),
        }
        # clean() always returns a str, so truthiness means "non-empty title".
        if photo["title"]:
            yield photo
|
|
@ -3,3 +3,5 @@
|
||||||
source ./venv/bin/activate.sh
|
source ./venv/bin/activate.sh
|
||||||
|
|
||||||
scrapy runspider photo_scraper/spiders/flickr.py -o scraped/flickr.csv
|
scrapy runspider photo_scraper/spiders/flickr.py -o scraped/flickr.csv
|
||||||
|
scrapy runspider photo_scraper/spiders/stock123rf.py -o scraped/123rf.csv
|
||||||
|
scrapy runspider photo_scraper/spiders/shutterstock.py -o scraped/shutterstock.csv
|
||||||
|
|
|
@ -1,6 +0,0 @@
|
||||||
#!/bin/sh
|
|
||||||
|
|
||||||
rm tosolr.csv
|
|
||||||
touch tosolr.csv
|
|
||||||
pv 123rf.csv | awk '{print NR-1 "," $0}' | tail -n +2 | awk 'BEGIN {print "id,t_author,t_title,t_description,date,img_url,tags"} {print}' >> tosolr.csv
|
|
||||||
pv photos.csv | awk '{print NR-1 "," $0}' | tail -n +2 | awk 'BEGIN {print "id,t_author,t_title,t_description,date,img_url,tags"} {print}' >> tosolr.csv
|
|
5810
scraped/shutterstock.csv
Normal file
5810
scraped/shutterstock.csv
Normal file
File diff suppressed because it is too large
Load diff
74413
scraped/tosolr.csv
74413
scraped/tosolr.csv
File diff suppressed because one or more lines are too long
|
@ -29,7 +29,7 @@ solr/bin/solr start
|
||||||
cd scraped
|
cd scraped
|
||||||
|
|
||||||
# POST scraped data
|
# POST scraped data
|
||||||
cat photos.csv 123rf.csv | \
|
cat photos.csv 123rf.csv shutterstock.csv | \
|
||||||
awk '{print NR-1 "," $0}' | \
|
awk '{print NR-1 "," $0}' | \
|
||||||
tail -n +2 | \
|
tail -n +2 | \
|
||||||
awk 'BEGIN {print "id,t_author,t_title,t_description,date,img_url,tags"} {print}' | \
|
awk 'BEGIN {print "id,t_author,t_title,t_description,date,img_url,tags"} {print}' | \
|
||||||
|
|
|
@ -36,7 +36,7 @@
|
||||||
<a class="img_box" href="{{ url }}" target="_blank"
|
<a class="img_box" href="{{ url }}" target="_blank"
|
||||||
style="background-image: url({{ url }})"/></a>
|
style="background-image: url({{ url }})"/></a>
|
||||||
<div class="description">
|
<div class="description">
|
||||||
<h2><a href="{{ site_url }}">{{ title }}</a></h2>
|
<h2>{{ title }}</h2>
|
||||||
<h3>By {{ author }}</h3>
|
<h3>By {{ author }}</h3>
|
||||||
{{ description }}</div>
|
{{ description }}</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
Reference in a new issue