Added 123RF parser
This commit is contained in:
parent
9845f6bbd2
commit
4edfcd99d7
2 changed files with 14 additions and 26 deletions
|
@ -6,29 +6,17 @@ import dateparser
|
||||||
class Stock123refSpider(scrapy.Spider):
|
class Stock123refSpider(scrapy.Spider):
|
||||||
name = "123rfscraper"
|
name = "123rfscraper"
|
||||||
|
|
||||||
start_urls = ["https://www.flickr.com/photos/tags/train",
|
start_urls = ["https://www.123rf.com/stock-photo/"]
|
||||||
"https://www.flickr.com/photos/tags/tree",
|
|
||||||
"https://www.flickr.com/photos/tags/outside"]
|
def parse(self, response):
|
||||||
|
links = response.css('.index-stockphoto-thumb-container a::attr("href")').getall()
|
||||||
|
for link in links:
|
||||||
|
yield response.follow(link, self.parse_photo_list)
|
||||||
|
|
||||||
|
def parse_photo_list(self, response):
|
||||||
|
links = response.css('.mosaic-main-container a::attr("href")').getall()
|
||||||
|
for link in links:
|
||||||
|
yield response.follow(link, self.parse_photo)
|
||||||
|
|
||||||
def parse_photo(self, response):
|
def parse_photo(self, response):
|
||||||
adesc = re.compile("<h2 class=\" meta-field photo-desc \">(.*)<\\/h2>", re.DOTALL)
|
yield [response.css('title::text').get()]
|
||||||
ad = re.compile("(\\w+\\s*\\d+\\s*,\\s*\\d+)", re.MULTILINE)
|
|
||||||
|
|
||||||
photo = {}
|
|
||||||
photo["author"] = response.css("a.owner-name::text").get().strip(' \n\t')
|
|
||||||
photo["title"] = response.css("h1::text").get().strip(' \n\t')
|
|
||||||
|
|
||||||
photo["description"] = ''.join(adesc.findall(response.text)).strip(' \n\t')
|
|
||||||
date = response.css("span.date-taken-label::text").get()
|
|
||||||
date = ad.findall(date)
|
|
||||||
date = date[0]
|
|
||||||
|
|
||||||
photo["date"] = dateparser.parse(date)
|
|
||||||
if photo["date"] is not None:
|
|
||||||
photo["date"] = photo["date"].strftime("%Y-%m-%d")
|
|
||||||
else:
|
|
||||||
photo["date"] = ''
|
|
||||||
photo["img_url"] = "https:" + response.css("img.main-photo::attr('src')").get()
|
|
||||||
photo["tags"] = response.css("ul.tags-list li a[title]::text").getall()
|
|
||||||
yield photo
|
|
||||||
|
|
||||||
|
|
|
@ -4,8 +4,8 @@
|
||||||
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
||||||
|
|
||||||
[settings]
|
[settings]
|
||||||
default = imgur.settings
|
default = photo_scraper.settings
|
||||||
|
|
||||||
[deploy]
|
[deploy]
|
||||||
#url = http://localhost:6800/
|
#url = http://localhost:6800/
|
||||||
project = imgur
|
project = photo_scraper
|
||||||
|
|
Reference in a new issue