# crawler-MiniProgram/dagongPro/spiders/dagong.py

import re

import scrapy
from selenium import webdriver

from dagongwangPro.items import DagongwangproItem
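
# For reference: dagongwangPro/items.py presumably defines an item with the
# seven fields populated by this spider. A minimal sketch, inferred from the
# assignments below rather than copied from the actual items.py:
#
#   import scrapy
#
#   class DagongwangproItem(scrapy.Item):
#       title = scrapy.Field()    # cleaned headline
#       src = scrapy.Field()      # thumbnail image URL
#       news_id = scrapy.Field()  # 'newsid' + numeric id from the detail URL
#       index = scrapy.Field()    # board index (0/2/4/6/8)
#       content = scrapy.Field()  # article body text
#       date = scrapy.Field()     # publication date
#       author = scrapy.Field()   # author / source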

class DagongSpider(scrapy.Spider):
    name = 'dagong'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.takungpao.com/news/index.html']
    models_urls = []  # holds the URLs of the five news boards

    def __init__(self):
        # Instantiate the browser object
        super().__init__()
        self.bro = webdriver.Chrome(executable_path=r'D:\pythondemo\crawler\chromedriver.exe')
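        # Note: Selenium 4 removed the executable_path keyword; a Selenium 4
        # equivalent would be (a sketch, assuming the same local driver path):
        #
        #   from selenium.webdriver.chrome.service import Service
        #   self.bro = webdriver.Chrome(service=Service(r'D:\pythondemo\crawler\chromedriver.exe'))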

    # Parse the index page for the detail URLs of the five news boards
    def parse(self, response):
        li_list = response.xpath('/html/body/div[2]/div/div[2]/ul[2]/li[2]/div/ul/li')
        alist = [0, 2, 4, 6, 8]  # li positions: [0 Mainland, 2 Hong Kong, 4 Cross-strait, 6 International, 8 Military]
        for index in alist:
            model_url = li_list[index].xpath('./a/@href').extract_first()
            self.models_urls.append(model_url)
        # Request each board page in turn; meta carries the matching board
        # index (index field added 4.11)
        for index, url in zip(alist, self.models_urls):
            yield scrapy.Request(url, callback=self.parse_model, meta={'index': index})

    # Parse each board page: extract the title, thumbnail, and detail-page URL
    # of every news entry
    def parse_model(self, response):
        div_list = response.xpath('//div[@class="wrapper clearfix"]/div[1]/dl')
        for div in div_list:
            name = div.xpath('./dd[1]/a/text()').extract_first()
            # URL of the accompanying image
            src = div.xpath('./dt/a/img/@src').extract_first()
            # Strip special characters from the title: bracketed annotations
            # and characters that are unsafe in file names
            title = re.sub(r'\(.*?\)|\{.*?\}|\[.*?\]|[*:/"\\?|]', '', name)
            new_detail_url = div.xpath('./dd[1]/a/@href').extract_first()
            # e.g. a detail URL ending in '12345.html' yields 'newsid12345'
            news_id = 'newsid' + (new_detail_url.split('/')[-1]).split('.')[0]
            item = DagongwangproItem()
            item['title'] = title
            item['src'] = src
            item['news_id'] = news_id
            # Board index, added 4.11
            item['index'] = response.meta['index']
            # Request the news detail page, passing the item along in meta
            yield scrapy.Request(url=new_detail_url, callback=self.parse_detail, meta={'item': item})

    # Parse the article content on the news detail page
    def parse_detail(self, response):
        # The detail page's own title is extracted here but not stored; the
        # cleaned title from parse_model is kept on the item
        title = response.xpath('//div[@class="wrap_left"]/h2/text()').extract_first()
        content = response.xpath('//div[@class="wrap_left"]/div[3]//text()').extract()
        content = ''.join(content)
        date = response.xpath('//div[@class="wrap_left"]/div[1]/div[1]/span[1]/text()').extract_first()
        author = response.xpath('//div[@class="wrap_left"]/div[1]/div[1]/span[2]/text() | //div[@class="wrap_left"]/div[1]/div[1]/span[2]/a/text()').extract_first()
        item = response.meta['item']
        item['content'] = content
        item['date'] = date
        item['author'] = author
        yield item

    # Quit the browser when the spider closes
    def close(self, spider):
        self.bro.quit()
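
# ---------------------------------------------------------------------------
# The spider opens a Chrome instance (self.bro) but never drives it in this
# file. In the usual Scrapy + Selenium pattern, a downloader middleware uses
# that browser to render the dynamically loaded board pages before they reach
# parse_model. A minimal sketch, assuming it lives in
# dagongwangPro/middlewares.py (the class name is illustrative, not copied
# from this repository):
#
#   import time
#   from scrapy.http import HtmlResponse
#
#   class DagongwangproDownloaderMiddleware:
#       def process_response(self, request, response, spider):
#           # Only the five board pages need browser rendering
#           if request.url in spider.models_urls:
#               spider.bro.get(request.url)
#               time.sleep(2)  # crude wait for the dynamic content to load
#               return HtmlResponse(url=request.url,
#                                   body=spider.bro.page_source,
#                                   encoding='utf-8',
#                                   request=request)
#           return response
#
# enabled in settings.py with something like:
#
#   DOWNLOADER_MIDDLEWARES = {
#       'dagongwangPro.middlewares.DagongwangproDownloaderMiddleware': 543,
#   }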