From 4edfcd99d7e8440383fce3678ed55823b77882e6 Mon Sep 17 00:00:00 2001
From: Claudio Maggioni <maggicl@usi.ch>
Date: Tue, 10 Nov 2020 20:41:13 +0100
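Subject: [PATCH] Added 123RF parser

Replace the Flickr start URLs and Flickr-specific photo parsing in
stock123rf.py with a three-step 123RF crawl: parse() follows the links
on the stock-photo index page, parse_photo_list() follows the links in
each photo list, and parse_photo() handles the individual photo page.
parse_photo() is currently a placeholder that only yields the page
title wrapped in a list; the author/description/date/tags extraction
is removed and not yet reimplemented for 123RF.

Also point scrapy.cfg at the photo_scraper project (settings module and
deploy project name) instead of imgur.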
---
 photo_scraper/spiders/stock123rf.py | 36 ++++++++++-------------------
 scrapy.cfg                          |  4 ++--
 2 files changed, 14 insertions(+), 26 deletions(-)
diff --git a/photo_scraper/spiders/stock123rf.py b/photo_scraper/spiders/stock123rf.py
index bb19357..41c8be4 100644
--- a/photo_scraper/spiders/stock123rf.py
+++ b/photo_scraper/spiders/stock123rf.py
@@ -6,29 +6,17 @@ import dateparser
 class Stock123refSpider(scrapy.Spider):
     name = "123rfscraper"
 
-    start_urls = ["https://www.flickr.com/photos/tags/train",
-                  "https://www.flickr.com/photos/tags/tree",
-                  "https://www.flickr.com/photos/tags/outside"]
+    start_urls = ["https://www.123rf.com/stock-photo/"]
+
+    def parse(self, response):
+        links = response.css('.index-stockphoto-thumb-container a::attr("href")').getall()
+        for link in links:
+            yield response.follow(link, self.parse_photo_list)
+
+    def parse_photo_list(self, response):
+        links = response.css('.mosaic-main-container a::attr("href")').getall()
+        for link in links:
+            yield response.follow(link, self.parse_photo)
 
     def parse_photo(self, response):
-        adesc = re.compile("<h2 class=\" meta-field photo-desc \">(.*)<\\/h2>", re.DOTALL)
-        ad = re.compile("(\\w+\\s*\\d+\\s*,\\s*\\d+)", re.MULTILINE)
-
-        photo = {}
-        photo["author"] = response.css("a.owner-name::text").get().strip(' \n\t')
-        photo["title"] = response.css("h1::text").get().strip(' \n\t')
-
-        photo["description"] = ''.join(adesc.findall(response.text)).strip(' \n\t')
-        date = response.css("span.date-taken-label::text").get()
-        date = ad.findall(date)
-        date = date[0]
-
-        photo["date"] = dateparser.parse(date)
-        if photo["date"] is not None:
-            photo["date"] = photo["date"].strftime("%Y-%m-%d")
-        else:
-            photo["date"] = ''
-        photo["img_url"] = "https:" + response.css("img.main-photo::attr('src')").get()
-        photo["tags"] = response.css("ul.tags-list li a[title]::text").getall()
-        yield photo
-
+        yield [response.css('title::text').get()]
diff --git a/scrapy.cfg b/scrapy.cfg
index c88feee..eafae31 100644
--- a/scrapy.cfg
+++ b/scrapy.cfg
@@ -4,8 +4,8 @@
 # https://scrapyd.readthedocs.io/en/latest/deploy.html
 
 [settings]
-default = imgur.settings
+default = photo_scraper.settings
 
 [deploy]
 #url = http://localhost:6800/
-project = imgur
+project = photo_scraper