import re

import dateparser
import scrapy
from scrapy_splash import SplashRequest

# Characters trimmed from both ends of every scraped text field.
_STRIP_CHARS = "\t\r\n "


def _strip_or_empty(text):
    """Return *text* with surrounding whitespace trimmed, or "" when it is None."""
    if text is None:
        return ""
    return text.strip(_STRIP_CHARS)


class Stock123refSpider(scrapy.Spider):
    """Spider that starts at the 123rf stock-photo index and yields one item per photo page."""

    name = "123rfscraper"
    start_urls = ["https://www.123rf.com/stock-photo/"]

    def parse(self, response):
        """Follow every link found inside the index thumbnail containers.

        Each followed page is handed to ``parse_photo_list``.
        """
        hrefs = response.css(
            '.index-stockphoto-thumb-container a::attr("href")'
        ).getall()
        yield from (
            response.follow(href, self.parse_photo_list) for href in hrefs
        )

    def parse_photo_list(self, response):
        """Follow every link in the mosaic container, then the next-page link if any.

        Mosaic links are handed to ``parse_photo``; the next-page link is
        handed back to this method so the listing is walked page by page.
        """
        hrefs = response.css('.mosaic-main-container a::attr("href")').getall()
        yield from (response.follow(href, self.parse_photo) for href in hrefs)

        next_page = response.css('#btn_main_nextpg::attr("href")').get()
        if next_page is None:
            # No next-page button on this page: nothing more to follow.
            return
        yield response.follow(next_page, self.parse_photo_list)

    def parse_photo(self, response):
        """Yield a dict describing the photo page, or nothing if it has no ``h1`` text.

        The item carries ``author``, ``title``, ``description``, ``date``
        (always an empty string), ``img_url`` and ``tags``.
        """
        title = response.css('h1::text').get()
        if title is None:
            # Pages without a heading are skipped entirely.
            return

        author = response.css('#contributorPortfolioLink::text').get()
        description = response.css("#imageDescriptionText::text").get()
        img_url = response.css("picture source::attr('srcset')").get()
        tags = response.css('.keywords-container a div.ui.label::text').getall()

        yield {
            "author": _strip_or_empty(author),
            "title": _strip_or_empty(title),
            "description": _strip_or_empty(description),
            "date": "",
            "img_url": img_url,
            "tags": tags,
        }