import re

import scrapy
from selenium import webdriver

# NOTE(review): the patch creates files under ``dagongPro/`` but imports from
# ``dagongwangPro`` — confirm the installed package name actually matches.
from dagongwangPro.items import DagongwangproItem

# Characters/spans stripped from headlines before they are stored (they are
# unsafe in filenames).  In the original pattern the trailing lazy ``.*?``
# after ``\*``, ``\:`` etc. matched zero characters, so those alternatives
# reduce to single characters — expressed here as one character class.
_TITLE_JUNK = re.compile(r'\(.*?\)|\{.*?}|\[.*?]|[*:/"\\?]')


class DagongSpider(scrapy.Spider):
    """Crawl selected news sections of takungpao.com.

    A Selenium-driven Chrome instance (``self.bro``) is created alongside the
    spider (presumably consumed by a downloader middleware to render dynamic
    pages — confirm against the project's middlewares) and quit on close.
    """

    name = 'dagong'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.takungpao.com/news/index.html']

    # Raw string: the original non-raw literal contained the invalid escape
    # sequence ``\p`` (DeprecationWarning today, an error in future Python)
    # and would silently corrupt any path containing ``\n``/``\t``.
    CHROMEDRIVER_PATH = r'D:\pythondemo\crawler\chromedriver.exe'

    def __init__(self, *args, **kwargs):
        # The original override skipped Spider.__init__, which sets up
        # ``name``/``start_urls``/logging — call it first.
        super().__init__(*args, **kwargs)
        # Instance attribute (the original used a mutable CLASS attribute,
        # shared across all instances of the spider).
        self.models_urls = []  # section-page URLs collected by parse()
        # NOTE(review): ``executable_path`` was removed in Selenium 4; switch
        # to ``webdriver.chrome.service.Service`` when upgrading Selenium.
        self.bro = webdriver.Chrome(executable_path=self.CHROMEDRIVER_PATH)

    @staticmethod
    def _clean_title(raw_title):
        """Return *raw_title* stripped of bracketed text and unsafe characters.

        Returns ``None`` unchanged — the original code crashed with a
        ``TypeError`` when the title XPath matched nothing.
        """
        if raw_title is None:
            return None
        return _TITLE_JUNK.sub('', raw_title)

    def parse(self, response):
        """Parse the news index page and follow the chosen section pages."""
        li_list = response.xpath('/html/body/div[2]/div/div[2]/ul[2]/li[2]/div/ul/li')
        # Section indexes: 0 mainland, 2 Hong Kong, 4 cross-strait,
        # 6 international, 8 military.  Only "international" is crawled.
        for index in [6]:
            model_url = li_list[index].xpath('./a/@href').extract_first()
            self.models_urls.append(model_url)

        # Request each collected section page.
        for url in self.models_urls:
            yield scrapy.Request(url, callback=self.parse_model)

    def parse_model(self, response):
        """Parse a section page: emit one detail request per headline."""
        for entry in response.xpath('//div[@class="wrapper clearfix"]/div[1]/dl'):
            raw_name = entry.xpath('./dd[1]/a/text()').extract_first()
            detail_url = entry.xpath('./dd[1]/a/@href').extract_first()

            item = DagongwangproItem()
            item['title'] = self._clean_title(raw_name)

            # Hand the partially-filled item to the detail parser via meta.
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})

    def parse_detail(self, response):
        """Parse an article page: fill in body text and date, emit the item."""
        content = ''.join(
            response.xpath('//div[@class="wrap_left"]/div[3]//text()').extract())
        date = ''.join(
            response.xpath('//div[@class="wrap_left"]/div[1]/div[1]//text()').extract())

        item = response.meta['item']
        item['content'] = content
        item['date'] = date
        yield item

    def close(self, spider):
        """Quit the shared Chrome instance when the spider closes."""
        # NOTE(review): Scrapy's shutdown hook is the static
        # ``close(spider, reason)`` / instance ``closed(reason)`` pair;
        # confirm this override is actually invoked with this signature.
        self.bro.quit()