import re

import dateparser
import scrapy
from scrapy_splash import SplashRequest

# Captures the photo description that precedes a closing </h2> in the raw HTML.
# NOTE(review): in the source as received this literal was broken across
# physical lines (two line breaks before "(.*)"), which is not valid Python.
# The breaks are kept here as explicit "\n" escapes so the module parses.
# The opening tag of the pattern appears to have been lost -- confirm the
# intended pattern against the original file / page markup.
_DESCRIPTION_RE = re.compile("\n\n(.*)<\\/h2>", re.DOTALL)

# Matches a "<word> <day>, <year>" fragment such as "March 3, 2019".
_DATE_RE = re.compile(r"(\w+\s*\d+\s*,\s*\d+)", re.MULTILINE)

# Characters trimmed from both ends of every extracted text field.
_STRIP_CHARS = " \n\t"


def _clean(text):
    """Return *text* trimmed of spaces, newlines and tabs ('' when None)."""
    if text is None:
        return ""
    return text.strip(_STRIP_CHARS)


def _format_date(label):
    """Return the date found in *label* as ``YYYY-MM-DD``, or '' if none.

    *label* is the raw text of the "date taken" element. It may be None
    (element missing) or contain no recognisable date; both yield ''.
    """
    if not label:
        return ""
    match = _DATE_RE.search(label)
    if match is None:
        return ""
    parsed = dateparser.parse(match.group(1))
    if parsed is None:
        return ""
    return parsed.strftime("%Y-%m-%d")


class Stock123refSpider(scrapy.Spider):
    """Spider that starts from three Flickr tag pages and scrapes photo data.

    NOTE(review): no callback visible in this file schedules ``parse_photo``;
    presumably a ``parse`` method or request factory exists elsewhere.
    """

    name = "123rfscraper"
    start_urls = [
        "https://www.flickr.com/photos/tags/train",
        "https://www.flickr.com/photos/tags/tree",
        "https://www.flickr.com/photos/tags/outside",
    ]

    def parse_photo(self, response):
        """Extract one photo record from a photo detail page.

        Args:
            response: the response for a single photo page; it must offer
                ``.css()`` selection and a ``.text`` attribute.

        Yields:
            A dict with the keys ``author``, ``title``, ``description``,
            ``date`` (``YYYY-MM-DD`` or ''), ``img_url`` and ``tags``.
            Any field whose element is absent from the page is yielded as
            an empty string (or an empty list for ``tags``) instead of
            aborting the whole item.
        """
        photo = {}
        # .get() returns None when the selector matches nothing, so every
        # text field goes through _clean() rather than calling .strip()
        # on a possibly-None value.
        photo["author"] = _clean(response.css("a.owner-name::text").get())
        photo["title"] = _clean(response.css("h1::text").get())
        photo["description"] = _clean(
            "".join(_DESCRIPTION_RE.findall(response.text))
        )
        photo["date"] = _format_date(
            response.css("span.date-taken-label::text").get()
        )

        # The original prefixes the src with "https:", i.e. it expects a
        # protocol-relative URL. A missing or empty src yields ''.
        src = response.css("img.main-photo::attr('src')").get()
        photo["img_url"] = "https:" + src if src else ""

        photo["tags"] = response.css("ul.tags-list li a[title]::text").getall()
        yield photo