Added Shutterstock crawler
This commit is contained in:
parent
81dacae2ea
commit
fab188aba7
7 changed files with 5847 additions and 74421 deletions
33
photo_scraper/spiders/shutterstock.py
Normal file
33
photo_scraper/spiders/shutterstock.py
Normal file
|
@ -0,0 +1,33 @@
|
|||
import scrapy
|
||||
import re
|
||||
from scrapy_splash import SplashRequest
|
||||
import dateparser
|
||||
|
||||
def clean(s):
    """Normalize a scraped text fragment.

    Strips leading/trailing tabs, carriage returns, newlines, and spaces.
    Returns the empty string when *s* is None (e.g. a missing CSS match),
    so callers can treat every field uniformly as a string.
    """
    return "" if s is None else s.strip("\t\r\n ")
|
||||
|
||||
class ShutterstockSpider(scrapy.Spider):
    """Crawl the Shutterstock featured-collections archive and yield one
    dict per photo detail page.

    Crawl shape: the archive page links to collection cards (followed
    recursively with ``parse``) and to individual photo tiles (followed
    with ``parse_photo``).
    """

    name = "shutterstockscraper"

    start_urls = ["https://www.shutterstock.com/featured-collections/archive"]

    def parse(self, response):
        """Fan out from a collection/archive page.

        Yields follow-up requests for every photo tile and for every
        nested collection card found on the page.
        """
        # Photo tiles go to the detail-page parser.
        for photo_href in response.css('a[data-automation=mosaic-grid-cell-anchor]::attr("href")').getall():
            yield response.follow(photo_href, self.parse_photo)
        # Collection cards recurse back into this method.
        for collection_href in response.css('div[data-automation=GridCard_card_container] a::attr("href")').getall():
            yield response.follow(collection_href, self.parse)

    def parse_photo(self, response):
        """Extract one photo record from a Shutterstock detail page.

        Yields a dict with author/title/description/date/img_url/tags;
        pages whose title cannot be extracted are skipped entirely.
        """
        select = response.css
        item = {
            "author": clean(select('a[data-automation=AssetDetails_contributorLink]::text').get()),
            "title": clean(select('h1::text').get()),
            "description": clean(select('div[data-automation=ImageDetailsPage_ImageSize_Dropdown] p::text').get()),
            # No publication date is exposed on the page; kept as an
            # empty placeholder so the CSV schema stays consistent.
            "date": "",
            "img_url": select('div[role=presentation] img::attr("src")').get(),
            "tags": select('div[data-automation=ExpandableKeywordsList] a::text').getall(),
        }
        # An empty title means extraction failed — drop the record.
        if item["title"]:
            yield item
|
|
@ -3,3 +3,5 @@
|
|||
source ./venv/bin/activate.sh
|
||||
|
||||
scrapy runspider photo_scraper/spiders/flickr.py -o scraped/flickr.csv
|
||||
scrapy runspider photo_scraper/spiders/stock123rf.py -o scraped/123rf.csv
|
||||
scrapy runspider photo_scraper/spiders/shutterstock.py -o scraped/shutterstock.csv
|
||||
|
|
|
@ -1,6 +0,0 @@
|
|||
#!/bin/sh
|
||||
|
||||
rm tosolr.csv
|
||||
touch tosolr.csv
|
||||
pv 123rf.csv | awk '{print NR-1 "," $0}' | tail -n +2 | awk 'BEGIN {print "id,t_author,t_title,t_description,date,img_url,tags"} {print}' >> tosolr.csv
|
||||
pv photos.csv | awk '{print NR-1 "," $0}' | tail -n +2 | awk 'BEGIN {print "id,t_author,t_title,t_description,date,img_url,tags"} {print}' >> tosolr.csv
|
5810
scraped/shutterstock.csv
Normal file
5810
scraped/shutterstock.csv
Normal file
File diff suppressed because it is too large
Load diff
74413
scraped/tosolr.csv
74413
scraped/tosolr.csv
File diff suppressed because one or more lines are too long
|
@ -29,7 +29,7 @@ solr/bin/solr start
|
|||
cd scraped
|
||||
|
||||
# POST scraped data
|
||||
cat photos.csv 123rf.csv | \
|
||||
cat photos.csv 123rf.csv shutterstock.csv | \
|
||||
awk '{print NR-1 "," $0}' | \
|
||||
tail -n +2 | \
|
||||
awk 'BEGIN {print "id,t_author,t_title,t_description,date,img_url,tags"} {print}' | \
|
||||
|
|
|
@ -36,7 +36,7 @@
|
|||
<a class="img_box" href="{{ url }}" target="_blank"
|
||||
style="background-image: url({{ url }})"/></a>
|
||||
<div class="description">
|
||||
<h2><a href="{{ site_url }}">{{ title }}</a></h2>
|
||||
<h2>{{ title }}</h2>
|
||||
<h3>By {{ author }}</h3>
|
||||
{{ description }}</div>
|
||||
</div>
|
||||
|
|
Reference in a new issue