import re

import scrapy
from selenium import webdriver

# NOTE(review): the patch creates files under ``dagongPro/`` but imports from
# ``dagongwangPro`` — confirm the installed package name actually matches.
from dagongwangPro.items import DagongwangproItem

# Characters/spans stripped from headlines before they are stored (they are
# unsafe in filenames).  In the original pattern the trailing lazy ``.*?``
# after ``\*``, ``\:`` etc. matched zero characters, so those alternatives
# reduce to single characters — expressed here as one character class.
_TITLE_JUNK = re.compile(r'\(.*?\)|\{.*?}|\[.*?]|[*:/"\\?]')


class DagongSpider(scrapy.Spider):
    """Crawl selected news sections of takungpao.com.

    A Selenium-driven Chrome instance (``self.bro``) is created alongside the
    spider (presumably consumed by a downloader middleware to render dynamic
    pages — confirm against the project's middlewares) and quit on close.
    """

    name = 'dagong'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.takungpao.com/news/index.html']

    # Raw string: the original non-raw literal contained the invalid escape
    # sequence ``\p`` (DeprecationWarning today, an error in future Python)
    # and would silently corrupt any path containing ``\n``/``\t``.
    CHROMEDRIVER_PATH = r'D:\pythondemo\crawler\chromedriver.exe'

    def __init__(self, *args, **kwargs):
        # The original override skipped Spider.__init__, which sets up
        # ``name``/``start_urls``/logging — call it first.
        super().__init__(*args, **kwargs)
        # Instance attribute (the original used a mutable CLASS attribute,
        # shared across all instances of the spider).
        self.models_urls = []  # section-page URLs collected by parse()
        # NOTE(review): ``executable_path`` was removed in Selenium 4; switch
        # to ``webdriver.chrome.service.Service`` when upgrading Selenium.
        self.bro = webdriver.Chrome(executable_path=self.CHROMEDRIVER_PATH)

    @staticmethod
    def _clean_title(raw_title):
        """Return *raw_title* stripped of bracketed text and unsafe characters.

        Returns ``None`` unchanged — the original code crashed with a
        ``TypeError`` when the title XPath matched nothing.
        """
        if raw_title is None:
            return None
        return _TITLE_JUNK.sub('', raw_title)

    def parse(self, response):
        """Parse the news index page and follow the chosen section pages."""
        li_list = response.xpath('/html/body/div[2]/div/div[2]/ul[2]/li[2]/div/ul/li')
        # Section indexes: 0 mainland, 2 Hong Kong, 4 cross-strait,
        # 6 international, 8 military.  Only "international" is crawled.
        for index in [6]:
            model_url = li_list[index].xpath('./a/@href').extract_first()
            self.models_urls.append(model_url)

        # Request each collected section page.
        for url in self.models_urls:
            yield scrapy.Request(url, callback=self.parse_model)

    def parse_model(self, response):
        """Parse a section page: emit one detail request per headline."""
        for entry in response.xpath('//div[@class="wrapper clearfix"]/div[1]/dl'):
            raw_name = entry.xpath('./dd[1]/a/text()').extract_first()
            detail_url = entry.xpath('./dd[1]/a/@href').extract_first()

            item = DagongwangproItem()
            item['title'] = self._clean_title(raw_name)

            # Hand the partially-filled item to the detail parser via meta.
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})

    def parse_detail(self, response):
        """Parse an article page: fill in body text and date, emit the item."""
        content = ''.join(
            response.xpath('//div[@class="wrap_left"]/div[3]//text()').extract())
        date = ''.join(
            response.xpath('//div[@class="wrap_left"]/div[1]/div[1]//text()').extract())

        item = response.meta['item']
        item['content'] = content
        item['date'] = date
        yield item

    def close(self, spider):
        """Quit the shared Chrome instance when the spider closes."""
        # NOTE(review): Scrapy's shutdown hook is the static
        # ``close(spider, reason)`` / instance ``closed(reason)`` pair;
        # confirm this override is actually invoked with this signature.
        self.bro.quit()