From 4edfcd99d7e8440383fce3678ed55823b77882e6 Mon Sep 17 00:00:00 2001 From: Claudio Maggioni Date: Tue, 10 Nov 2020 20:41:13 +0100 Subject: [PATCH] Added 123RF parser --- photo_scraper/spiders/stock123rf.py | 36 ++++++++++------------------- scrapy.cfg | 4 ++-- 2 files changed, 14 insertions(+), 26 deletions(-) diff --git a/photo_scraper/spiders/stock123rf.py b/photo_scraper/spiders/stock123rf.py index bb19357..41c8be4 100644 --- a/photo_scraper/spiders/stock123rf.py +++ b/photo_scraper/spiders/stock123rf.py @@ -6,29 +6,17 @@ import dateparser class Stock123refSpider(scrapy.Spider): name = "123rfscraper" - start_urls = ["https://www.flickr.com/photos/tags/train", - "https://www.flickr.com/photos/tags/tree", - "https://www.flickr.com/photos/tags/outside"] + start_urls = ["https://www.123rf.com/stock-photo/"] + + def parse(self, response): + links = response.css('.index-stockphoto-thumb-container a::attr("href")').getall() + for link in links: + yield response.follow(link, self.parse_photo_list) + + def parse_photo_list(self, response): + links = response.css('.mosaic-main-container a::attr("href")').getall() + for link in links: + yield response.follow(link, self.parse_photo) def parse_photo(self, response): - adesc = re.compile("

(.*)<\\/h2>", re.DOTALL) - ad = re.compile("(\\w+\\s*\\d+\\s*,\\s*\\d+)", re.MULTILINE) - - photo = {} - photo["author"] = response.css("a.owner-name::text").get().strip(' \n\t') - photo["title"] = response.css("h1::text").get().strip(' \n\t') - - photo["description"] = ''.join(adesc.findall(response.text)).strip(' \n\t') - date = response.css("span.date-taken-label::text").get() - date = ad.findall(date) - date = date[0] - - photo["date"] = dateparser.parse(date) - if photo["date"] is not None: - photo["date"] = photo["date"].strftime("%Y-%m-%d") - else: - photo["date"] = '' - photo["img_url"] = "https:" + response.css("img.main-photo::attr('src')").get() - photo["tags"] = response.css("ul.tags-list li a[title]::text").getall() - yield photo - + yield [response.css('title::text').get()] diff --git a/scrapy.cfg b/scrapy.cfg index c88feee..eafae31 100644 --- a/scrapy.cfg +++ b/scrapy.cfg @@ -4,8 +4,8 @@ # https://scrapyd.readthedocs.io/en/latest/deploy.html [settings] -default = imgur.settings +default = photo_scraper.settings [deploy] #url = http://localhost:6800/ -project = imgur +project = photo_scraper