Compare commits

...

2 Commits

Author SHA1 Message Date
Claudio Maggioni 4edfcd99d7 Added 123RF parser 2020-11-10 20:41:13 +01:00
Claudio Maggioni 9845f6bbd2 Renamed scrapy project 2020-11-10 18:36:41 +01:00
11 changed files with 32 additions and 10 deletions

View File

@@ -1,4 +1,4 @@
# Scrapy settings for imgur project
# Scrapy settings for photo_scraper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
@@ -7,10 +7,10 @@
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'imgur'
BOT_NAME = 'photo_scraper'
SPIDER_MODULES = ['imgur.spiders']
NEWSPIDER_MODULE = 'imgur.spiders'
SPIDER_MODULES = ['photo_scraper.spiders']
NEWSPIDER_MODULE = 'photo_scraper.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
@@ -60,13 +60,13 @@ CONCURRENT_REQUESTS = 256
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'imgur.middlewares.ImgurSpiderMiddleware': 543,
# 'photo_scraper.middlewares.ImgurSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'imgur.middlewares.ImgurDownloaderMiddleware': 543,
# 'photo_scraper.middlewares.ImgurDownloaderMiddleware': 543,
#}
# Enable or disable extensions
@@ -78,7 +78,7 @@ CONCURRENT_REQUESTS = 256
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'imgur.pipelines.ImgurPipeline': 300,
# 'photo_scraper.pipelines.ImgurPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)

View File

@@ -0,0 +1,22 @@
import scrapy
import re
from scrapy_splash import SplashRequest
import dateparser
class Stock123refSpider(scrapy.Spider):
    """Spider that crawls 123rf.com stock-photo listings and scrapes page titles.

    Crawl flow: index page -> category pages -> individual photo pages,
    yielding one item (the page <title> text) per photo page.
    """

    name = "123rfscraper"
    start_urls = ["https://www.123rf.com/stock-photo/"]

    def parse(self, response):
        """Follow every category link on the stock-photo index page.

        Yields scrapy Requests handled by ``parse_photo_list``.
        """
        links = response.css('.index-stockphoto-thumb-container a::attr("href")').getall()
        for link in links:
            yield response.follow(link, self.parse_photo_list)

    def parse_photo_list(self, response):
        """Follow every photo link inside a category's mosaic listing.

        Yields scrapy Requests handled by ``parse_photo``.
        """
        links = response.css('.mosaic-main-container a::attr("href")').getall()
        for link in links:
            yield response.follow(link, self.parse_photo)

    def parse_photo(self, response):
        """Yield the photo page's <title> text as a scraped item.

        Bug fix: the original yielded a bare list ``[title]``; Scrapy
        callbacks must yield a Request, an item (dict/Item), or None —
        yielding a list raises "Spider must return request, item, or None".
        A dict item also gives the feed exporter a proper column name.
        """
        yield {"title": response.css('title::text').get()}

View File

@@ -2,4 +2,4 @@
source ./venv/bin/activate.sh
scrapy runspider imgur/spiders/explore.py -o scraped/photos.csv
scrapy runspider photo_scraper/spiders/flickr.py -o scraped/flickr.csv

View File

@@ -4,8 +4,8 @@
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = imgur.settings
default = photo_scraper.settings
[deploy]
#url = http://localhost:6800/
project = imgur
project = photo_scraper