This repository has been archived on 2020-12-10. You can view files and clone it, but cannot push or open issues or pull requests.
IRProject/photo_scraper/spiders/stock123rf.py
Claudio Maggioni (maggicl) fb92add03f Crawled 123RF
2020-11-11 14:58:58 +01:00

36 lines
1.4 KiB
Python

import scrapy
import re
from scrapy_splash import SplashRequest
import dateparser
class Stock123refSpider(scrapy.Spider):
    """Spider that walks 123RF stock-photo pages and yields one dict per photo.

    The crawl proceeds in three hops: the start page (``parse``), a paginated
    photo listing (``parse_photo_list``), and finally the individual photo
    page (``parse_photo``), which produces the scraped record.
    """

    name = "123rfscraper"
    start_urls = ["https://www.123rf.com/stock-photo/"]

    def parse(self, response):
        """Schedule a listing request for every thumbnail link on the page."""
        thumb_hrefs = response.css(
            '.index-stockphoto-thumb-container a::attr("href")'
        ).getall()
        for href in thumb_hrefs:
            yield response.follow(href, self.parse_photo_list)

    def parse_photo_list(self, response):
        """Schedule a request per photo link, then follow the next-page link.

        The next-page request is only issued when the ``#btn_main_nextpg``
        element carries an ``href`` attribute.
        """
        photo_hrefs = response.css(
            '.mosaic-main-container a::attr("href")'
        ).getall()
        for href in photo_hrefs:
            yield response.follow(href, self.parse_photo)

        next_page = response.css('#btn_main_nextpg::attr("href")').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse_photo_list)

    def parse_photo(self, response):
        """Yield a record describing the photo shown on this page.

        Pages without any ``h1`` text yield nothing. Missing author or
        description text is reported as an empty string; ``date`` is always
        emitted as an empty string.
        """
        title = response.css('h1::text').get()
        if title is None:
            # No heading text: nothing is emitted for this response.
            return

        blanks = "\t\r\n "
        author = response.css('#contributorPortfolioLink::text').get()
        description = response.css("#imageDescriptionText::text").get()

        record = {
            "author": author.strip(blanks) if author is not None else "",
            "title": title.strip(blanks),
            "description": (
                description.strip(blanks) if description is not None else ""
            ),
            "date": "",
            "img_url": response.css("picture source::attr('srcset')").get(),
            "tags": response.css(
                '.keywords-container a div.ui.label::text'
            ).getall(),
        }
        yield record