import re

import scrapy
from selenium import webdriver

from dagongwangPro.items import DagongwangproItem


class DagongSpider(scrapy.Spider):
    name = 'dagong'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.takungpao.com/news/index.html']
    models_urls = []  # stores the URLs of the five sections

    def __init__(self):
        # Instantiate the browser object once; a raw string keeps the
        # Windows path backslashes from being treated as escapes
        super().__init__()
        self.bro = webdriver.Chrome(executable_path=r'D:\pythondemo\crawler\chromedriver.exe')

    # Parse the detail-page URLs for the five sections
    def parse(self, response):
        li_list = response.xpath('/html/body/div[2]/div/div[2]/ul[2]/li[2]/div/ul/li')
        alist = [0, 2, 4, 6, 8]  # [0 Mainland, 2 Hong Kong, 4 Cross-strait, 6 International, 8 Military]
        for index in alist:
            model_url = li_list[index].xpath('./a/@href').extract_first()
            self.models_urls.append(model_url)
        # Send a request to each section URL in turn; zip pairs every URL
        # with its own section index
        for index, url in zip(alist, self.models_urls):
            # Added 4.11: news section code `index`
            yield scrapy.Request(url, callback=self.parse_model, meta={'index': index})

    # Parse the detail pages for the news titles in each section
    def parse_model(self, response):
        div_list = response.xpath('//div[@class="wrapper clearfix"]/div[1]/dl')
        for div in div_list:
            name = div.xpath('./dd[1]/a/text()').extract_first()
            if name is None:
                continue  # skip entries without a title
            # Image URL
            src = div.xpath('./dt/a/img/@src').extract_first()
            # Strip special characters from the title: bracketed text and
            # characters that are illegal in file names
            title = re.sub(r'\(.*?\)|\{.*?\}|\[.*?\]|[\\/:*?"|]', '', name)
            new_detail_url = div.xpath('./dd[1]/a/@href').extract_first()
            news_id = 'newsid' + new_detail_url.split('/')[-1].split('.')[0]
            item = DagongwangproItem()
            item['title'] = title
            item['src'] = src
            item['news_id'] = news_id
            # Added 4.11: news section code `index`
            item['index'] = response.meta['index']
            # Request the news detail page URL
            yield scrapy.Request(url=new_detail_url, callback=self.parse_detail,
                                 meta={'item': item})

    # Parse the news content
    def parse_detail(self, response):
        title = response.xpath('//div[@class="wrap_left"]/h2/text()').extract_first()
        content = response.xpath('//div[@class="wrap_left"]/div[3]//text()').extract()
        content = ''.join(content)
        date = response.xpath('//div[@class="wrap_left"]/div[1]/div[1]/span[1]/text()').extract_first()
        author = response.xpath(
            '//div[@class="wrap_left"]/div[1]/div[1]/span[2]/text()'
            ' | //div[@class="wrap_left"]/div[1]/div[1]/span[2]/a/text()'
        ).extract_first()
        item = response.meta['item']
        item['content'] = content
        item['date'] = date
        item['author'] = author
        yield item

    # Scrapy calls closed(reason) when the spider finishes; quit the browser here
    def closed(self, reason):
        self.bro.quit()
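

# The Chrome driver created in __init__ is not used directly in this file;
# in this Scrapy + Selenium pattern it is normally driven from a downloader
# middleware that re-renders the five section pages, whose news lists are
# loaded dynamically. The sketch below is an assumption, not the project's
# actual middlewares.py: the class name, the 2-second wait, and the settings
# entry are illustrative. It would normally live in middlewares.py and be
# enabled in settings.py, e.g.:
# DOWNLOADER_MIDDLEWARES = {
#     'dagongwangPro.middlewares.DagongwangproDownloaderMiddleware': 543,
# }
from time import sleep

from scrapy.http import HtmlResponse


class DagongwangproDownloaderMiddleware:
    # Hypothetical middleware: swap in a Selenium-rendered response for the
    # five section URLs collected by the spider; pass everything else through
    def process_response(self, request, response, spider):
        if request.url in spider.models_urls:
            bro = spider.bro  # reuse the browser instance created by the spider
            bro.get(request.url)
            sleep(2)  # crude wait for the dynamically loaded news list
            # Build a new HtmlResponse from the rendered page source so the
            # spider's XPath selectors see the full DOM
            return HtmlResponse(url=request.url, body=bro.page_source,
                                encoding='utf-8', request=request)
        return response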