# IRProject/photo_scraper/spiders/flickr.py
import scrapy
import re
from scrapy_splash import SplashRequest
import dateparser

class ExploreSpider(scrapy.Spider):
    name = "explorescraper"
    # Search terms used to build Flickr search URLs in start_requests().
    terms = ['usi', 'eth', 'epfl', 'lugano', 'zurich', 'basel', 'oliveto%20lario', 'homework', 'plagiarism', 'rhb', 'sbb',
             'ship', 'frighten', 'fool', 'have', 'inspire', 'heal', 'master', 'terminate', 'amend', 'scratch', 'embark',
             'entail', 'execute', 'consolidate', 'cash', 'round', 'isolate', 'warrant', 'signal', 'weaken', 'pin',
             'march', 'desire', 'widen', 'level', 'chat', 'board', 'contend', 'invent', 'resource', 'manufacture', 'seal',
             'reconsider', 'suck', 'picture', 'crash', 'transport', 'plug', 'assign', 'enquire', 'campaign', 'trap',
             'surround', 'debate', 'upgrade', 'decorate', 'confer', 'accumulate', 'profit', 'file', 'inherit', 'disrupt',
             'contrast', 'chuck', 'tick', 'plead', 'dip', 'subscribe', 'educate', 'divorce', 'spin', 'row', 'obscure',
             'creep', 'interest', 'overlook', 'twist', 'mature', 'blend', 'revise', 'attribute', 'explode', 'dwell',
             'drown', 'alleviate', 'strip', 'grade', 'revert', 'value', 'award', 'strive', 'notify', 'remedy', 'accuse',
             'instruct', 'spill', 'strain', 'comprehend', 'soften', 'postpone', 'wave', 'bounce', 'stock', 'position',
             'insure', 'adhere', 'cling', 'summon', 'pause', 'empty', 'classify']
    # Pages that need Splash to scroll and load more photos.
    infinite_scroll_urls = ["https://flickr.com/explore"]
    start_urls = ["https://www.flickr.com/photos/tags/train",
                  "https://www.flickr.com/photos/tags/tree",
                  "https://www.flickr.com/photos/tags/outside"]

    def parse_photo(self, response):
        # The photo description is not exposed as clean text, so pull it out of the raw HTML.
        adesc = re.compile(r'<h2 class=" meta-field photo-desc ">(.*?)</h2>', re.DOTALL)
        # Matches a "Month day, year" style date in the "Taken on ..." label.
        ad = re.compile(r'(\w+\s*\d+\s*,\s*\d+)', re.MULTILINE)
        photo = {}
        photo["author"] = response.css("a.owner-name::text").get(default='').strip(' \n\t')
        photo["title"] = response.css("h1::text").get(default='').strip(' \n\t')
        photo["description"] = ''.join(adesc.findall(response.text)).strip(' \n\t')
        # Parse the date-taken label into an ISO date; fall back to '' when it cannot be parsed.
        date_label = response.css("span.date-taken-label::text").get(default='')
        date_match = ad.findall(date_label)
        parsed_date = dateparser.parse(date_match[0]) if date_match else None
        photo["date"] = parsed_date.strftime("%Y-%m-%d") if parsed_date is not None else ''
        img_src = response.css("img.main-photo::attr('src')").get()
        photo["img_url"] = "https:" + img_src if img_src else ''
        photo["tags"] = response.css("ul.tags-list li a[title]::text").getall()
        yield photo

    def start_requests(self):
        # Add one Flickr search page per term to the list of pages that need infinite scrolling.
        for term in self.terms:
            self.infinite_scroll_urls.append("https://www.flickr.com/search/?text=" + term)
        # Load the Splash Lua script that scrolls the page a few times (read it once, not per request).
        with open("infinite_scroll.lua", "r") as f:
            script = f.read()
        for url in self.infinite_scroll_urls:
            # Build a Splash request that runs the scrolling script before returning the rendered HTML.
            yield SplashRequest(url=url, callback=self.parse, endpoint='execute', args={
                "wait": 3,
                "timeout": 90,
                "lua_source": script,
                "headers": {'User-Agent': "Mozilla/5.0 (X11; Linux x86_64; rv:82.0) Gecko/20100101 Firefox/82.0"}
            })

    def parse(self, response):
        # Each thumbnail links to its photo page; follow it and extract the metadata there.
        for q in response.css(".photo-list-photo-interaction"):
            link = q.css("a::attr('href')").get()
            yield response.follow(link, self.parse_photo)
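
# Usage sketch (an assumption, not part of the original file): with a Splash instance
# running, scrapy-splash wired into the project's settings (SPLASH_URL plus its
# downloader middlewares and dupefilter), and infinite_scroll.lua present in the
# working directory, the spider can be run with, e.g.:
#
#   scrapy crawl explorescraper -o photos.json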