from selenium import webdriver
import scrapy
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders import CrawlSpider, Rule
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from ..items import PicItem
 
 class MirrowSpider(CrawlSpider):
 name = "mirrow"
 
 allowed_domains = ["dimtown.com"]
 start_urls = ["https://dimtown.com/cosplay/page/1"]
 
 rules = (
 
 Rule(LinkExtractor(allow=r'/cosplay/page/[1-2]'), follow=True),
 Rule(LinkExtractor(allow=r'/\d+\.html'), callback='parse_item', follow=False),
 )
 
 def __init__(self, *args, **kwargs):
 super(MirrowSpider, self).__init__(*args, **kwargs)
 self.driver = webdriver.Chrome()
 
 def parse_item(self, response):
 self.driver.get(response.url)
 
 user_agent = response.request.headers.get('User-Agent').decode('utf-8')
 self.logger.info(f"当前使用的 User-Agent: {user_agent}")
 
 self.driver.get(response.url)
 wait = WebDriverWait(self.driver, 10)
 try:
 
 title_element = WebDriverWait(self.driver, 10).until(
 
 EC.visibility_of_element_located((By.XPATH, '//h1'))
 )
 title = title_element.text.strip()
 self.logger.info("标题:%s", title)
 
 
 
 img_url_elements = self.driver.find_elements(By.XPATH, '//img[@decoding="async"]')
 
 
 img_urls = []
 
 
 for img_url_element in img_url_elements:
 img_url = img_url_element.get_attribute('src')
 img_urls.append(img_url)
 
 self.logger.info("所有图片网址:%s", img_urls)
 
 
 item = PicItem(image_urls=img_urls, title=title)
 
 
 yield item
 
 except Exception as e:
 self.logger.error("An error occurred: %s", e)
 
 def closed(self, reason):
 
 self.driver.quit()
 