Added Shutterstock crawler
This commit is contained in:
parent
81dacae2ea
commit
fab188aba7
7 changed files with 5847 additions and 74421 deletions
33
photo_scraper/spiders/shutterstock.py
Normal file
33
photo_scraper/spiders/shutterstock.py
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
import scrapy
|
||||||
|
import re
|
||||||
|
from scrapy_splash import SplashRequest
|
||||||
|
import dateparser
|
||||||
|
|
||||||
|
def clean(s):
    """Normalize a scraped text fragment.

    Strips surrounding tabs, carriage returns, newlines and spaces.
    Returns the empty string when given None, so callers can treat the
    result as a plain str without a null check.
    """
    return "" if s is None else s.strip("\t\r\n ")
|
||||||
|
|
||||||
|
class ShutterstockSpider(scrapy.Spider):
    """Crawl Shutterstock's featured-collections archive and yield photo metadata.

    Starting from the archive index, parse() follows each photo tile to
    parse_photo() and recurses into further collection cards. Each yielded
    item is a dict with the keys: author, title, description, date, img_url,
    tags — matching the CSV columns used by the sibling spiders.
    """

    name = "shutterstockscraper"

    start_urls = ["https://www.shutterstock.com/featured-collections/archive"]

    def parse(self, response):
        """Follow every photo link on a collection page, then recurse into
        the other collection cards found on the same page."""
        image_links = response.css('a[data-automation=mosaic-grid-cell-anchor]::attr("href")').getall()
        for link in image_links:
            yield response.follow(link, self.parse_photo)
        # Collection cards link to further archive pages; crawl them with parse().
        collection_links = response.css('div[data-automation=GridCard_card_container] a::attr("href")').getall()
        for link in collection_links:
            yield response.follow(link, self.parse)

    def parse_photo(self, response):
        """Extract one photo's metadata from its detail page.

        Items with an empty title are dropped — they indicate the page
        layout did not match the selectors.
        """
        photo = {
            "author": clean(response.css('a[data-automation=AssetDetails_contributorLink]::text').get()),
            "title": clean(response.css('h1::text').get()),
            "description": clean(response.css('div[data-automation=ImageDetailsPage_ImageSize_Dropdown] p::text').get()),
            # The page does not expose a capture/upload date; keep the
            # column present so the merged CSV schema stays aligned.
            "date": "",
            "img_url": response.css('div[role=presentation] img::attr("src")').get(),
            "tags": response.css('div[data-automation=ExpandableKeywordsList] a::text').getall(),
        }
        # clean() always returns a str, so truthiness means "non-empty title".
        if photo["title"]:
            yield photo
|
|
@ -3,3 +3,5 @@
|
||||||
source ./venv/bin/activate.sh
|
source ./venv/bin/activate.sh
|
||||||
|
|
||||||
scrapy runspider photo_scraper/spiders/flickr.py -o scraped/flickr.csv
|
scrapy runspider photo_scraper/spiders/flickr.py -o scraped/flickr.csv
|
||||||
|
scrapy runspider photo_scraper/spiders/stock123rf.py -o scraped/123rf.csv
|
||||||
|
scrapy runspider photo_scraper/spiders/shutterstock.py -o scraped/shutterstock.csv
|
||||||
|
|
|
@ -1,6 +0,0 @@
|
||||||
#!/bin/sh
|
|
||||||
|
|
||||||
rm tosolr.csv
|
|
||||||
touch tosolr.csv
|
|
||||||
pv 123rf.csv | awk '{print NR-1 "," $0}' | tail -n +2 | awk 'BEGIN {print "id,t_author,t_title,t_description,date,img_url,tags"} {print}' >> tosolr.csv
|
|
||||||
pv photos.csv | awk '{print NR-1 "," $0}' | tail -n +2 | awk 'BEGIN {print "id,t_author,t_title,t_description,date,img_url,tags"} {print}' >> tosolr.csv
|
|
5810
scraped/shutterstock.csv
Normal file
5810
scraped/shutterstock.csv
Normal file
File diff suppressed because it is too large
Load diff
74413
scraped/tosolr.csv
74413
scraped/tosolr.csv
File diff suppressed because one or more lines are too long
|
@ -29,7 +29,7 @@ solr/bin/solr start
|
||||||
cd scraped
|
cd scraped
|
||||||
|
|
||||||
# POST scraped data
|
# POST scraped data
|
||||||
cat photos.csv 123rf.csv | \
|
cat photos.csv 123rf.csv shutterstock.csv | \
|
||||||
awk '{print NR-1 "," $0}' | \
|
awk '{print NR-1 "," $0}' | \
|
||||||
tail -n +2 | \
|
tail -n +2 | \
|
||||||
awk 'BEGIN {print "id,t_author,t_title,t_description,date,img_url,tags"} {print}' | \
|
awk 'BEGIN {print "id,t_author,t_title,t_description,date,img_url,tags"} {print}' | \
|
||||||
|
|
|
@ -36,7 +36,7 @@
|
||||||
<a class="img_box" href="{{ url }}" target="_blank"
|
<a class="img_box" href="{{ url }}" target="_blank"
|
||||||
style="background-image: url({{ url }})"/></a>
|
style="background-image: url({{ url }})"/></a>
|
||||||
<div class="description">
|
<div class="description">
|
||||||
<h2><a href="{{ site_url }}">{{ title }}</a></h2>
|
<h2>{{ title }}</h2>
|
||||||
<h3>By {{ author }}</h3>
|
<h3>By {{ author }}</h3>
|
||||||
{{ description }}</div>
|
{{ description }}</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
Reference in a new issue