35 lines
1.3 KiB
Python
35 lines
1.3 KiB
Python
|
import scrapy
|
||
|
import re
|
||
|
from scrapy_splash import SplashRequest
|
||
|
import dateparser
|
||
|
|
||
|
class Stock123refSpider(scrapy.Spider):
    """Scrapy spider seeded with three Flickr tag-listing URLs.

    ``parse_photo`` turns a single photo page response into a plain dict
    item with the keys ``author``, ``title``, ``description``, ``date``,
    ``img_url`` and ``tags``.

    NOTE(review): the spider name says "123rf" while every start URL points
    at Flickr; the name is left untouched because it is the identifier used
    to launch the crawl.
    NOTE(review): this class defines neither ``parse`` nor
    ``start_requests``, so nothing visible here schedules ``parse_photo`` --
    confirm how the callback is wired up.
    """

    name = "123rfscraper"

    start_urls = [
        "https://www.flickr.com/photos/tags/train",
        "https://www.flickr.com/photos/tags/tree",
        "https://www.flickr.com/photos/tags/outside",
    ]

    # Patterns are compiled once per class instead of once per response.
    # The description capture is non-greedy: with DOTALL a greedy ``.*``
    # would run on to the *last* ``</h2>`` of the whole document.
    _DESCRIPTION_RE = re.compile(
        r'<h2 class=" meta-field photo-desc ">(.*?)</h2>', re.DOTALL
    )
    # Matches text shaped like "<word> <digits>, <digits>".
    _DATE_RE = re.compile(r"(\w+\s*\d+\s*,\s*\d+)")

    # Characters trimmed from both ends of every extracted text field.
    _STRIP_CHARS = " \n\t"

    def parse_photo(self, response):
        """Extract one photo item from a photo page response.

        Fields whose element is absent from the page are emitted as an
        empty string (``tags`` as an empty list) rather than raising, so a
        single incomplete page does not abort the callback.

        Args:
            response: The Scrapy response of a photo page.

        Yields:
            dict: Item with the keys ``author``, ``title``, ``description``,
            ``date`` (``YYYY-MM-DD`` or ``''``), ``img_url`` and ``tags``.
        """
        strip_chars = self._STRIP_CHARS

        photo = {}
        # ``get(default="")`` keeps ``.strip`` safe when nothing matches.
        photo["author"] = (
            response.css("a.owner-name::text").get(default="").strip(strip_chars)
        )
        photo["title"] = response.css("h1::text").get(default="").strip(strip_chars)

        # The description is taken from the raw HTML, so any inner markup
        # of the heading is preserved in the captured text.
        photo["description"] = "".join(
            self._DESCRIPTION_RE.findall(response.text)
        ).strip(strip_chars)

        date_label = response.css("span.date-taken-label::text").get(default="")
        date_match = self._DATE_RE.search(date_label)
        # dateparser returns None for text it cannot interpret; both that
        # case and a label without a date-like fragment yield ''.
        parsed_date = dateparser.parse(date_match.group(1)) if date_match else None
        photo["date"] = (
            parsed_date.strftime("%Y-%m-%d") if parsed_date is not None else ""
        )

        img_src = response.css("img.main-photo::attr('src')").get(default="")
        # Only protocol-relative sources ("//host/path") need a scheme;
        # prefixing anything else would produce a malformed URL.
        if img_src.startswith("//"):
            img_src = "https:" + img_src
        photo["img_url"] = img_src

        photo["tags"] = response.css("ul.tags-list li a[title]::text").getall()

        yield photo
|
||
|
|