From eace6a5c1312ddc048cdea30c76cec711f1b951b Mon Sep 17 00:00:00 2001 From: link_1999 <1402246900@qq.com> Date: Wed, 6 Apr 2022 20:15:47 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E4=BB=B6=E8=87=B3?= =?UTF-8?q?=20'dagongPro/spiders'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 新增:将新闻标题、发布单位、发布日期、正文内容添加到MySQL数据库功能 --- dagongPro/spiders/dagong.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dagongPro/spiders/dagong.py b/dagongPro/spiders/dagong.py index 799a06a..c1bdf74 100644 --- a/dagongPro/spiders/dagong.py +++ b/dagongPro/spiders/dagong.py @@ -18,7 +18,7 @@ class DagongSpider(scrapy.Spider): def parse(self, response): li_list = response.xpath('/html/body/div[2]/div/div[2]/ul[2]/li[2]/div/ul/li') - alist = [6] # [0内地, 2香港, 4两岸, 6国际, 8军事] + alist = [8] # [0内地, 2香港, 4两岸, 6国际, 8军事] for index in alist: model_url = li_list[index].xpath('./a/@href').extract_first() self.models_urls.append(model_url) @@ -34,7 +34,7 @@ class DagongSpider(scrapy.Spider): name = div.xpath('./dd[1]/a/text()').extract_first() # 取出标题中的特殊字符 - title = re.sub(u"\\(.*?\\)|\\{.*?}|\\[.*?]|\\*.*?|\\:.*?|\\/.*?|\\\".*?|\\\\.*?|\\?.*?", "", name) + title = re.sub(u"\\(.*?\\)|\\{.*?}|\\[.*?]|\\*.*?|\\:.*?|\\/.*?|\\\".*?|\\\\.*?|\\?.*?\\|.*?", "", name) new_detail_url = div.xpath('./dd[1]/a/@href').extract_first() # print(title) @@ -50,6 +50,7 @@ class DagongSpider(scrapy.Spider): title = response.xpath('//div[@class="wrap_left"]/h2/text()').extract_first() content = response.xpath('//div[@class="wrap_left"]/div[3]//text()').extract() content = ''.join(content) + print(content) date = response.xpath('//div[@class="wrap_left"]/div[1]/div[1]//text()').extract() date = ''.join(date)