IRProject/photo_scraper/spiders/stock123rf.py

import scrapy
import re
from scrapy_splash import SplashRequest
import dateparser

class Stock123refSpider(scrapy.Spider):
    """Spider whose ``parse_photo`` callback turns a photo page into an item.

    The crawl is seeded with the Flickr tag listing URLs in ``start_urls``.

    NOTE(review): no ``parse`` method or request that routes photo pages to
    ``parse_photo`` is visible in this class -- confirm where that wiring
    lives.
    """

    name = "123rfscraper"

    start_urls = ["https://www.flickr.com/photos/tags/train",
                  "https://www.flickr.com/photos/tags/tree",
                  "https://www.flickr.com/photos/tags/outside"]

    # Compiled once at class creation instead of on every callback.
    # The description capture is non-greedy so it stops at the first closing
    # </h2> rather than swallowing markup up to the last </h2> on the page.
    _DESC_RE = re.compile(
        r'<h2 class=" meta-field photo-desc ">(.*?)</h2>', re.DOTALL
    )
    # Matches text shaped like "<word> <digits>, <digits>", e.g. "May 3, 2019".
    _DATE_RE = re.compile(r"(\w+\s*\d+\s*,\s*\d+)", re.MULTILINE)

    def parse_photo(self, response):
        """Extract one photo item from a photo page response.

        Args:
            response: Scrapy response for a single photo page.

        Yields:
            dict: keys ``author``, ``title``, ``description``, ``date``
            (``YYYY-MM-DD``), ``img_url`` and ``tags`` (list of strings).
            A text field whose element is missing, or whose date cannot be
            found or parsed, is the empty string instead of raising.
        """
        photo = {}

        # ``default=""`` keeps ``.strip`` safe when the selector matches nothing.
        photo["author"] = (
            response.css("a.owner-name::text").get(default="").strip(' \n\t')
        )
        photo["title"] = response.css("h1::text").get(default="").strip(' \n\t')

        photo["description"] = ''.join(
            self._DESC_RE.findall(response.text)
        ).strip(' \n\t')

        # The label may be absent, or present without a recognisable date;
        # both cases fall through to the empty-string date below.
        date_label = response.css("span.date-taken-label::text").get(default="")
        date_match = self._DATE_RE.search(date_label)
        parsed_date = dateparser.parse(date_match.group(1)) if date_match else None
        if parsed_date is not None:
            photo["date"] = parsed_date.strftime("%Y-%m-%d")
        else:
            photo["date"] = ''

        src = response.css("img.main-photo::attr('src')").get()
        if not src:
            photo["img_url"] = ''
        elif src.startswith("//"):
            # Protocol-relative source: pin the scheme to https.
            photo["img_url"] = "https:" + src
        else:
            # Already absolute or page-relative: resolve against the page URL
            # rather than blindly prefixing a scheme.
            photo["img_url"] = response.urljoin(src)

        photo["tags"] = response.css("ul.tags-list li a[title]::text").getall()
        yield photo
Renamed scrapy project 2020-11-10 17:36:41 +00:00
			`import scrapy`
			`import re`
			`from scrapy_splash import SplashRequest`
			`import dateparser`

			`class Stock123refSpider(scrapy.Spider):`
			`name = "123rfscraper"`

			`start_urls = ["https://www.flickr.com/photos/tags/train",`
			`"https://www.flickr.com/photos/tags/tree",`
			`"https://www.flickr.com/photos/tags/outside"]`

			`def parse_photo(self, response):`
			`adesc = re.compile("<h2 class=\" meta-field photo-desc \">(.*)<\\/h2>", re.DOTALL)`
			`ad = re.compile("(\\w+\\s*\\d+\\s*,\\s*\\d+)", re.MULTILINE)`

			`photo = {}`
			`photo["author"] = response.css("a.owner-name::text").get().strip(' \n\t')`
			`photo["title"] = response.css("h1::text").get().strip(' \n\t')`

			`photo["description"] = ''.join(adesc.findall(response.text)).strip(' \n\t')`
			`date = response.css("span.date-taken-label::text").get()`
			`date = ad.findall(date)`
			`date = date[0]`

			`photo["date"] = dateparser.parse(date)`
			`if photo["date"] is not None:`
			`photo["date"] = photo["date"].strftime("%Y-%m-%d")`
			`else:`
			`photo["date"] = ''`
			`photo["img_url"] = "https:" + response.css("img.main-photo::attr('src')").get()`
			`photo["tags"] = response.css("ul.tags-list li a[title]::text").getall()`
			`yield photo`