Renamed scrapy project

2020-11-10 18:36:41 +01:00 · 2020-11-10 18:36:41 +01:00 · 9845f6bbd2
commit 9845f6bbd2
parent 68f7091ecb
10 changed files with 42 additions and 8 deletions
--- a/photo_scraper/__init__.py
+++ b/photo_scraper/__init__.py
--- a/photo_scraper/items.py
+++ b/photo_scraper/items.py
--- a/photo_scraper/middlewares.py
+++ b/photo_scraper/middlewares.py
--- a/photo_scraper/pipelines.py
+++ b/photo_scraper/pipelines.py
--- a/photo_scraper/settings.py
+++ b/photo_scraper/settings.py
@@ -1,4 +1,4 @@
-# Scrapy settings for imgur project
+# Scrapy settings for photo_scraper project
 #
 # For simplicity, this file contains only settings considered important or
 # commonly used. You can find more settings consulting the documentation:
@@ -7,10 +7,10 @@
 #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-BOT_NAME = 'imgur'
+BOT_NAME = 'photo_scraper'
-SPIDER_MODULES = ['imgur.spiders']
+SPIDER_MODULES = ['photo_scraper.spiders']
-NEWSPIDER_MODULE = 'imgur.spiders'
+NEWSPIDER_MODULE = 'photo_scraper.spiders'
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
@@ -60,13 +60,13 @@ CONCURRENT_REQUESTS = 256
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 #SPIDER_MIDDLEWARES = {
-#    'imgur.middlewares.ImgurSpiderMiddleware': 543,
+#    'photo_scraper.middlewares.ImgurSpiderMiddleware': 543,
 #}
 # Enable or disable downloader middlewares
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 #DOWNLOADER_MIDDLEWARES = {
-#    'imgur.middlewares.ImgurDownloaderMiddleware': 543,
+#    'photo_scraper.middlewares.ImgurDownloaderMiddleware': 543,
 #}
 # Enable or disable extensions
@@ -78,7 +78,7 @@ CONCURRENT_REQUESTS = 256
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 #ITEM_PIPELINES = {
-#    'imgur.pipelines.ImgurPipeline': 300,
+#    'photo_scraper.pipelines.ImgurPipeline': 300,
 #}
 # Enable and configure the AutoThrottle extension (disabled by default)
--- a/photo_scraper/spiders/__init__.py
+++ b/photo_scraper/spiders/__init__.py
--- a/photo_scraper/spiders/flickr.py
+++ b/photo_scraper/spiders/flickr.py
--- a/photo_scraper/spiders/infinite_scroll.lua
+++ b/photo_scraper/spiders/infinite_scroll.lua
--- a/photo_scraper/spiders/stock123rf.py
+++ b/photo_scraper/spiders/stock123rf.py
@@ -0,0 +1,34 @@
 import scrapy
 import re
 from scrapy_splash import SplashRequest
 import dateparser
 class Stock123refSpider(scrapy.Spider):
    """Spider that extracts photo metadata from a photo detail page.

    NOTE(review): the spider is named for 123rf, but ``start_urls`` and the
    CSS selectors below target Flickr pages -- confirm which site is intended.
    NOTE(review): no ``parse`` callback is visible in this class, so nothing
    shown here schedules ``parse_photo`` -- confirm how it gets invoked.
    """
    name = "123rfscraper"
    start_urls = ["https://www.flickr.com/photos/tags/train",
                  "https://www.flickr.com/photos/tags/tree",
                  "https://www.flickr.com/photos/tags/outside"]
    # Compiled once at class level instead of on every parsed page.
    # DOTALL lets the description span several lines of markup.
    _DESC_RE = re.compile(r'<h2 class=" meta-field photo-desc ">(.*)<\/h2>', re.DOTALL)
    # Matches text shaped like "<word> <digits>, <digits>", e.g. "May 3, 2019".
    _DATE_RE = re.compile(r"(\w+\s*\d+\s*,\s*\d+)", re.MULTILINE)
    @staticmethod
    def _clean(value):
        """Strip surrounding spaces/newlines/tabs; map a missing value to ''."""
        if value is None:
            return ''
        return value.strip(' \n\t')
    def parse_photo(self, response):
        """Yield one dict describing the photo shown on ``response``.

        The dict has the keys ``author``, ``title``, ``description``,
        ``date`` (formatted ``YYYY-MM-DD``), ``img_url`` and ``tags``.
        Any text field whose element is missing from the page, or whose
        date cannot be found or parsed, is emitted as an empty string
        rather than raising, so one incomplete page does not lose the item.
        """
        photo = {}
        # Selector ``.get()`` returns None when nothing matches; ``_clean``
        # absorbs that instead of failing on ``None.strip``.
        photo["author"] = self._clean(response.css("a.owner-name::text").get())
        photo["title"] = self._clean(response.css("h1::text").get())
        photo["description"] = ''.join(self._DESC_RE.findall(response.text)).strip(' \n\t')
        date_label = response.css("span.date-taken-label::text").get()
        # Guard both the missing label and a label without a date-like part.
        date_match = self._DATE_RE.search(date_label) if date_label else None
        parsed_date = dateparser.parse(date_match.group(1)) if date_match else None
        if parsed_date is not None:
            photo["date"] = parsed_date.strftime("%Y-%m-%d")
        else:
            photo["date"] = ''
        image_src = response.css("img.main-photo::attr('src')").get()
        # The scheme is prepended unconditionally, as before; presumably the
        # page uses protocol-relative ``src`` values -- TODO confirm.
        photo["img_url"] = "https:" + image_src if image_src else ''
        photo["tags"] = response.css("ul.tags-list li a[title]::text").getall()
        yield photo
--- a/scrape.sh
+++ b/scrape.sh
@@ -2,4 +2,4 @@
 source ./venv/bin/activate.sh
-scrapy runspider imgur/spiders/explore.py -o scraped/photos.csv
+scrapy runspider photo_scraper/spiders/flickr.py -o scraped/flickr.csv
`@ -2,4 +2,4 @@`

	`source ./venv/bin/activate.sh`	`source ./venv/bin/activate.sh`

	`scrapy runspider imgur/spiders/explore.py -o scraped/photos.csv`	`scrapy runspider photo_scraper/spiders/flickr.py -o scraped/flickr.csv`