上传文件至 'dagongPro/spiders'

新增：将新闻标题、发布单位、发布日期、正文内容添加到 MySQL 数据库的功能
2022-04-06 20:15:47 +08:00 · 2022-04-06 20:15:47 +08:00 · eace6a5c13
parent 700db04c9c
commit eace6a5c13
1 changed file with 3 additions and 2 deletions
--- a/dagongPro/spiders/dagong.py
+++ b/dagongPro/spiders/dagong.py
@@ -18,7 +18,7 @@ class DagongSpider(scrapy.Spider):
    def parse(self, response):
        li_list = response.xpath('/html/body/div[2]/div/div[2]/ul[2]/li[2]/div/ul/li')
-        alist = [6]  # [0内地, 2香港, 4两岸, 6国际, 8军事]
+        alist = [8]  # [0内地, 2香港, 4两岸, 6国际, 8军事]
        for index in alist:
            model_url = li_list[index].xpath('./a/@href').extract_first()
            self.models_urls.append(model_url)
@@ -34,7 +34,7 @@ class DagongSpider(scrapy.Spider):
            name = div.xpath('./dd[1]/a/text()').extract_first()
            # 取出标题中的特殊字符
-            title = re.sub(u"\\(.*?\\)|\\{.*?}|\\[.*?]|\\*.*?|\\:.*?|\\/.*?|\\\".*?|\\\\.*?|\\?.*?", "", name)
+            title = re.sub(u"\\(.*?\\)|\\{.*?}|\\[.*?]|\\*.*?|\\:.*?|\\/.*?|\\\".*?|\\\\.*?|\\?.*?\\|.*?", "", name)
            new_detail_url = div.xpath('./dd[1]/a/@href').extract_first()
            # print(title)
@@ -50,6 +50,7 @@ class DagongSpider(scrapy.Spider):
        title = response.xpath('//div[@class="wrap_left"]/h2/text()').extract_first()
        content = response.xpath('//div[@class="wrap_left"]/div[3]//text()').extract()
        content = ''.join(content)
        print(content)
        date = response.xpath('//div[@class="wrap_left"]/div[1]/div[1]//text()').extract()
        date = ''.join(date)