Compare commits

...

2 Commits

Author SHA1 Message Date
Claudio Maggioni 4edfcd99d7 Added 123RF parser 2020-11-10 20:41:13 +01:00
Claudio Maggioni 9845f6bbd2 Renamed scrapy project 2020-11-10 18:36:41 +01:00
11 changed files with 32 additions and 10 deletions

View File

@@ -1,4 +1,4 @@
# Scrapy settings for imgur project
# Scrapy settings for photo_scraper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
@@ -7,10 +7,10 @@
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'imgur'
BOT_NAME = 'photo_scraper'
SPIDER_MODULES = ['imgur.spiders']
NEWSPIDER_MODULE = 'imgur.spiders'
SPIDER_MODULES = ['photo_scraper.spiders']
NEWSPIDER_MODULE = 'photo_scraper.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
@@ -60,13 +60,13 @@ CONCURRENT_REQUESTS = 256
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'imgur.middlewares.ImgurSpiderMiddleware': 543,
# 'photo_scraper.middlewares.ImgurSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'imgur.middlewares.ImgurDownloaderMiddleware': 543,
# 'photo_scraper.middlewares.ImgurDownloaderMiddleware': 543,
#}
# Enable or disable extensions
@@ -78,7 +78,7 @@ CONCURRENT_REQUESTS = 256
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'imgur.pipelines.ImgurPipeline': 300,
# 'photo_scraper.pipelines.ImgurPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)

View File

@@ -0,0 +1,22 @@
import scrapy
import re
from scrapy_splash import SplashRequest
import dateparser
class Stock123refSpider(scrapy.Spider):
    """Spider that crawls 123rf.com stock-photo listings and scrapes page titles.

    Crawl flow: index page -> category pages -> individual photo pages,
    yielding one item (the page <title> text) per photo page.
    """

    name = "123rfscraper"
    start_urls = ["https://www.123rf.com/stock-photo/"]

    def parse(self, response):
        """Follow every category link on the stock-photo index page.

        Yields scrapy Requests handled by ``parse_photo_list``.
        """
        links = response.css('.index-stockphoto-thumb-container a::attr("href")').getall()
        for link in links:
            yield response.follow(link, self.parse_photo_list)

    def parse_photo_list(self, response):
        """Follow every photo link inside a category's mosaic listing.

        Yields scrapy Requests handled by ``parse_photo``.
        """
        links = response.css('.mosaic-main-container a::attr("href")').getall()
        for link in links:
            yield response.follow(link, self.parse_photo)

    def parse_photo(self, response):
        """Yield the photo page's <title> text as a scraped item.

        Bug fix: the original yielded a bare list ``[title]``; Scrapy
        callbacks must yield a Request, an item (dict/Item), or None —
        yielding a list raises "Spider must return request, item, or None".
        A dict item also gives the feed exporter a proper column name.
        """
        yield {"title": response.css('title::text').get()}

View File

@@ -2,4 +2,4 @@
source ./venv/bin/activate.sh
scrapy runspider imgur/spiders/explore.py -o scraped/photos.csv
scrapy runspider photo_scraper/spiders/flickr.py -o scraped/flickr.csv

View File

@@ -4,8 +4,8 @@
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = imgur.settings
default = photo_scraper.settings
[deploy]
#url = http://localhost:6800/
project = imgur
project = photo_scraper