Crawled 123RF

2020-11-11 14:58:58 +01:00 · 2020-11-11 14:58:58 +01:00 · fb92add03f
commit fb92add03f
parent 4edfcd99d7
2 changed files with 50016 additions and 1 deletions
--- a/photo_scraper/spiders/stock123rf.py
+++ b/photo_scraper/spiders/stock123rf.py
@@ -17,6 +17,20 @@ class Stock123refSpider(scrapy.Spider):
        links = response.css('.mosaic-main-container a::attr("href")').getall()
        for link in links:
            yield response.follow(link, self.parse_photo)
        a = response.css('#btn_main_nextpg::attr("href")').get()
        if a is not None:
            yield response.follow(a, self.parse_photo_list)
    def parse_photo(self, response):
-        yield [response.css('title::text').get()]
+        if response.css('h1::text').get() is None:
            return
        a = response.css('#contributorPortfolioLink::text').get()
        b = response.css("#imageDescriptionText::text").get()
        yield {"author": "" if a is None else a.strip("\t\r\n "),
                "title": response.css('h1::text').get().strip("\t\r\n "),
               "description": "" if b is None else b.strip("\t\r\n "),
               "date": "",
               "img_url": response.css("picture source::attr('srcset')").get(),
               "tags": response.css('.keywords-container a div.ui.label::text').getall()}
--- a/scraped/123rf.csv
+++ b/scraped/123rf.csv