Renamed scrapy project
This commit is contained in:
parent
68f7091ecb
commit
9845f6bbd2
10 changed files with 42 additions and 8 deletions
|
@ -1,4 +1,4 @@
|
||||||
# Scrapy settings for imgur project
|
# Scrapy settings for photo_scraper project
|
||||||
#
|
#
|
||||||
# For simplicity, this file contains only settings considered important or
|
# For simplicity, this file contains only settings considered important or
|
||||||
# commonly used. You can find more settings consulting the documentation:
|
# commonly used. You can find more settings consulting the documentation:
|
||||||
|
@ -7,10 +7,10 @@
|
||||||
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||||
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||||
|
|
||||||
BOT_NAME = 'imgur'
|
BOT_NAME = 'photo_scraper'
|
||||||
|
|
||||||
SPIDER_MODULES = ['imgur.spiders']
|
SPIDER_MODULES = ['photo_scraper.spiders']
|
||||||
NEWSPIDER_MODULE = 'imgur.spiders'
|
NEWSPIDER_MODULE = 'photo_scraper.spiders'
|
||||||
|
|
||||||
|
|
||||||
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||||
|
@ -60,13 +60,13 @@ CONCURRENT_REQUESTS = 256
|
||||||
# Enable or disable spider middlewares
|
# Enable or disable spider middlewares
|
||||||
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||||
#SPIDER_MIDDLEWARES = {
|
#SPIDER_MIDDLEWARES = {
|
||||||
# 'imgur.middlewares.ImgurSpiderMiddleware': 543,
|
# 'photo_scraper.middlewares.ImgurSpiderMiddleware': 543,
|
||||||
#}
|
#}
|
||||||
|
|
||||||
# Enable or disable downloader middlewares
|
# Enable or disable downloader middlewares
|
||||||
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||||
#DOWNLOADER_MIDDLEWARES = {
|
#DOWNLOADER_MIDDLEWARES = {
|
||||||
# 'imgur.middlewares.ImgurDownloaderMiddleware': 543,
|
# 'photo_scraper.middlewares.ImgurDownloaderMiddleware': 543,
|
||||||
#}
|
#}
|
||||||
|
|
||||||
# Enable or disable extensions
|
# Enable or disable extensions
|
||||||
|
@ -78,7 +78,7 @@ CONCURRENT_REQUESTS = 256
|
||||||
# Configure item pipelines
|
# Configure item pipelines
|
||||||
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||||
#ITEM_PIPELINES = {
|
#ITEM_PIPELINES = {
|
||||||
# 'imgur.pipelines.ImgurPipeline': 300,
|
# 'photo_scraper.pipelines.ImgurPipeline': 300,
|
||||||
#}
|
#}
|
||||||
|
|
||||||
# Enable and configure the AutoThrottle extension (disabled by default)
|
# Enable and configure the AutoThrottle extension (disabled by default)
|
34
photo_scraper/spiders/stock123rf.py
Normal file
34
photo_scraper/spiders/stock123rf.py
Normal file
|
@ -0,0 +1,34 @@
|
||||||
|
import scrapy
|
||||||
|
import re
|
||||||
|
from scrapy_splash import SplashRequest
|
||||||
|
import dateparser
|
||||||
|
|
||||||
|
class Stock123refSpider(scrapy.Spider):
    """Spider that turns a photo detail page into a flat metadata dict.

    NOTE(review): the spider is named for 123rf but ``start_urls`` point at
    flickr tag pages, and no ``parse`` callback is visible in this view, so
    nothing here schedules ``parse_photo`` -- confirm how it is wired up.
    """

    name = "123rfscraper"

    start_urls = [
        "https://www.flickr.com/photos/tags/train",
        "https://www.flickr.com/photos/tags/tree",
        "https://www.flickr.com/photos/tags/outside",
    ]

    # Compiled once at class creation instead of on every response.
    # DOTALL lets the description span several lines of markup.
    _DESCRIPTION_RE = re.compile(
        r'<h2 class=" meta-field photo-desc ">(.*)<\/h2>', re.DOTALL
    )
    # Matches dates such as "March 3, 2019" inside the "taken on" label.
    _DATE_RE = re.compile(r"(\w+\s*\d+\s*,\s*\d+)", re.MULTILINE)
    # Whitespace trimmed from every extracted text field.
    _STRIP_CHARS = " \n\t"

    def parse_photo(self, response):
        """Extract one photo's metadata from its detail page.

        Args:
            response: the downloaded photo page; must expose ``css()`` and
                ``text`` as a Scrapy response does.

        Yields:
            A dict with the keys ``author``, ``title``, ``description``,
            ``date`` (``YYYY-MM-DD``), ``img_url`` and ``tags``. Any field
            whose element is missing from the page is yielded as an empty
            string (``tags`` as an empty list) instead of raising.
        """
        strip_chars = self._STRIP_CHARS

        photo = {}
        # ``default=''`` keeps a missing element from turning into
        # ``None.strip()`` and aborting the whole item.
        photo["author"] = (
            response.css("a.owner-name::text").get(default="").strip(strip_chars)
        )
        photo["title"] = response.css("h1::text").get(default="").strip(strip_chars)

        photo["description"] = "".join(
            self._DESCRIPTION_RE.findall(response.text)
        ).strip(strip_chars)

        # The label may be absent, or present without a recognisable date;
        # both cases fall through to the empty-string date below.
        date_label = response.css("span.date-taken-label::text").get()
        date_matches = self._DATE_RE.findall(date_label) if date_label else []
        parsed_date = dateparser.parse(date_matches[0]) if date_matches else None
        if parsed_date is not None:
            photo["date"] = parsed_date.strftime("%Y-%m-%d")
        else:
            photo["date"] = ""

        # The page serves a protocol-relative URL ("//host/path"), hence the
        # explicit scheme; skip the prefix when there is no image at all.
        img_src = response.css("img.main-photo::attr('src')").get()
        photo["img_url"] = "https:" + img_src if img_src else ""

        photo["tags"] = response.css("ul.tags-list li a[title]::text").getall()
        yield photo
|
||||||
|
|
|
@ -2,4 +2,4 @@
|
||||||
|
|
||||||
source ./venv/bin/activate.sh
|
source ./venv/bin/activate.sh
|
||||||
|
|
||||||
scrapy runspider imgur/spiders/explore.py -o scraped/photos.csv
|
scrapy runspider photo_scraper/spiders/flickr.py -o scraped/flickr.csv
|
||||||
|
|
Reference in a new issue