Done scraper

This commit is contained in:
Claudio Maggioni (maggicl) 2020-11-04 15:36:32 +01:00
commit 68f7091ecb
12 changed files with 24889 additions and 0 deletions

138
.gitignore vendored Normal file

@@ -0,0 +1,138 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/

0
imgur/__init__.py Normal file

12
imgur/items.py Normal file

@@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class ImgurItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
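
ImgurItem is left empty and the spider below yields plain dicts instead. As a sketch only (not part of the commit), the item class could declare the same fields that parse_photo in explore.py currently emits as dict keys:

import scrapy

class ImgurItem(scrapy.Item):
    # Hypothetical fields mirroring the dict keys yielded by parse_photo
    author = scrapy.Field()
    title = scrapy.Field()
    description = scrapy.Field()
    date = scrapy.Field()
    img_url = scrapy.Field()
    tags = scrapy.Field()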

103
imgur/middlewares.py Normal file

@@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class ImgurSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ImgurDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

13
imgur/pipelines.py Normal file

@@ -0,0 +1,13 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class ImgurPipeline:
    def process_item(self, item, spider):
        return item
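
ImgurPipeline is a pass-through and is never activated, since ITEM_PIPELINES stays commented out in settings.py. As a hedged sketch only (not part of the commit), a pipeline that discards photos without an image URL could look like the following, paired with uncommenting ITEM_PIPELINES = {'imgur.pipelines.ImgurPipeline': 300} in settings.py:

from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

class ImgurPipeline:
    def process_item(self, item, spider):
        # Hypothetical filter: drop scraped photos that have no usable img_url
        adapter = ItemAdapter(item)
        if not adapter.get("img_url"):
            raise DropItem("photo has no img_url")
        return item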

103
imgur/settings.py Normal file

@@ -0,0 +1,103 @@
# Scrapy settings for imgur project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'imgur'
SPIDER_MODULES = ['imgur.spiders']
NEWSPIDER_MODULE = 'imgur.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:82.0) Gecko/20100101 Firefox/82.0'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
SPLASH_URL = 'http://localhost:8050'
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 256
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'imgur.middlewares.ImgurSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'imgur.middlewares.ImgurDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'imgur.pipelines.ImgurPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

4
imgur/spiders/__init__.py Normal file

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

70
imgur/spiders/explore.py Normal file

@@ -0,0 +1,70 @@
import scrapy
import re
from scrapy_splash import SplashRequest
import dateparser


class ExploreSpider(scrapy.Spider):
    name = "explorescraper"

    terms = ['usi', 'eth', 'epfl', 'lugano', 'zurich', 'basel', 'oliveto%20lario', 'homework', 'plagiarism', 'rhb', 'sbb',
             'ship', 'frighten', 'fool', 'have', 'inspire', 'heal', 'master', 'terminate', 'amend', 'scratch', 'embark',
             'entail', 'execute', 'consolidate', 'cash', 'round', 'isolate', 'warrant', 'signal', 'weaken', 'pin',
             'march', 'desire', 'widen', 'level', 'chat', 'board', 'contend', 'invent', 'resource', 'manufacture', 'seal',
             'reconsider', 'suck', 'picture', 'crash', 'transport', 'plug', 'assign', 'enquire', 'campaign', 'trap',
             'surround', 'debate', 'upgrade', 'decorate', 'confer', 'accumulate', 'profit', 'file', 'inherit', 'disrupt',
             'contrast', 'chuck', 'tick', 'plead', 'dip', 'subscribe', 'educate', 'divorce', 'spin', 'row', 'obscure',
             'creep', 'interest', 'overlook', 'twist', 'mature', 'blend', 'revise', 'attribute', 'explode', 'dwell',
             'drown', 'alleviate', 'strip', 'grade', 'revert', 'value', 'award', 'strive', 'notify', 'remedy', 'accuse',
             'instruct', 'spill', 'strain', 'comprehend', 'soften', 'postpone', 'wave', 'bounce', 'stock', 'position',
             'insure', 'adhere', 'cling', 'summon', 'pause', 'empty', 'classify']

    infinite_scroll_urls = ["https://flickr.com/explore"]

    start_urls = ["https://www.flickr.com/photos/tags/train",
                  "https://www.flickr.com/photos/tags/tree",
                  "https://www.flickr.com/photos/tags/outside"]

    def parse_photo(self, response):
        # Photo descriptions may span several lines, hence the DOTALL regex
        adesc = re.compile("<h2 class=\" meta-field photo-desc \">(.*)<\\/h2>", re.DOTALL)
        # Matches dates like "November 4, 2020" inside the "date taken" label
        ad = re.compile("(\\w+\\s*\\d+\\s*,\\s*\\d+)", re.MULTILINE)

        photo = {}
        photo["author"] = response.css("a.owner-name::text").get().strip(' \n\t')
        photo["title"] = response.css("h1::text").get().strip(' \n\t')
        photo["description"] = ''.join(adesc.findall(response.text)).strip(' \n\t')

        date = response.css("span.date-taken-label::text").get()
        date = ad.findall(date)
        date = date[0]
        photo["date"] = dateparser.parse(date)
        if photo["date"] is not None:
            photo["date"] = photo["date"].strftime("%Y-%m-%d")
        else:
            photo["date"] = ''

        photo["img_url"] = "https:" + response.css("img.main-photo::attr('src')").get()
        photo["tags"] = response.css("ul.tags-list li a[title]::text").getall()
        yield photo

    def start_requests(self):
        for term in self.terms:
            self.infinite_scroll_urls.append("https://www.flickr.com/search/?text=" + term)

        for url in self.infinite_scroll_urls:
            # Load Splash lua script to scroll the page a few times
            script = ""
            with open("infinite_scroll.lua", "r") as f:
                script = f.read()

            # Build Splash request
            yield SplashRequest(url=url, callback=self.parse, endpoint='execute', args={
                "wait": 3,
                "timeout": 90,
                "lua_source": script,
                "headers": {'User-Agent': "Mozilla/5.0 (X11; Linux x86_64; rv:82.0) Gecko/20100101 Firefox/82.0"}
            })

    def parse(self, response):
        for q in response.css(".photo-list-photo-interaction"):
            link = q.css("a::attr('href')").get()
            yield response.follow(link, self.parse_photo)
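
The date handling in parse_photo hinges on the regex pulling a "Month day, year" substring out of Flickr's date-taken label and on dateparser converting it to a datetime before it is reformatted as YYYY-MM-DD. A minimal standalone sketch of that step (the sample label text is an assumption, not taken from a real response):

import re
import dateparser

# Assumed example of Flickr's date-taken label, for illustration only
label = "Taken on November 4, 2020"

ad = re.compile(r"(\w+\s*\d+\s*,\s*\d+)", re.MULTILINE)
date = ad.findall(label)[0]          # "November 4, 2020"
parsed = dateparser.parse(date)      # datetime(2020, 11, 4, 0, 0)
print(parsed.strftime("%Y-%m-%d"))   # prints 2020-11-04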

18
infinite_scroll.lua Normal file

@@ -0,0 +1,18 @@
-- Scrolls the page num_scrolls times so that the site's infinite scroll
-- loads more photos before the rendered HTML is returned to the spider.
function main(splash)
    local num_scrolls = 20
    local scroll_delay = 0.8

    local scroll_to = splash:jsfunc("window.scrollTo")
    local get_body_height = splash:jsfunc(
        "function() {return document.body.scrollHeight;}"
    )

    assert(splash:go(splash.args.url))
    splash:wait(splash.args.wait)

    for _ = 1, num_scrolls do
        scroll_to(0, get_body_height())
        splash:wait(scroll_delay)
    end

    return splash:html()
end

5
scrape.sh Executable file

@@ -0,0 +1,5 @@
#!/bin/bash
source ./venv/bin/activate
scrapy runspider imgur/spiders/explore.py -o scraped/photos.csv
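
Running scrape.sh assumes a Splash instance is already listening on the SPLASH_URL configured in settings.py (http://localhost:8050); the usual way to provide one is the scrapinghub/splash Docker image (e.g. docker run -p 8050:8050 scrapinghub/splash), started before invoking the script.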

24412
scraped/photos.csv Normal file

File diff suppressed because one or more lines are too long

11
scrapy.cfg Normal file

@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = imgur.settings
[deploy]
#url = http://localhost:6800/
project = imgur