上传文件至 'dagongPro/spiders'

2022-04-08 22:32:47 +08:00 · 2022-04-08 22:32:47 +08:00 · 7c9cfd3292
parent 88211ea6a5
commit 7c9cfd3292
1 changed file with 7 additions and 4 deletions
--- a/dagongPro/spiders/dagong.py
+++ b/dagongPro/spiders/dagong.py
@@ -17,7 +17,7 @@ class DagongSpider(scrapy.Spider):

    def parse(self, response):
        li_list = response.xpath('/html/body/div[2]/div/div[2]/ul[2]/li[2]/div/ul/li')
-        alist = [8]  # [0内地, 2香港, 4两岸, 6国际, 8军事]
+        alist = [4]  # [0内地, 2香港, 4两岸, 6国际, 8军事]
        for index in alist:
            model_url = li_list[index].xpath('./a/@href').extract_first()
            self.models_urls.append(model_url)
@@ -38,11 +38,12 @@ class DagongSpider(scrapy.Spider):
            title = re.sub(u"\\(.*?\\)|\\{.*?}|\\[.*?]|\\*.*?|\\:.*?|\\/.*?|\\\".*?|\\\\.*?|\\?.*?\\|.*?", "", name)

            new_detail_url = div.xpath('./dd[1]/a/@href').extract_first()
-            # print(title)
+            news_id = 'newsid'+(new_detail_url.split('/')[-1]).split('.')[0]

            item = DagongwangproItem()
            item['title'] = title
            item['src'] = src
+            item['news_id'] = news_id

            # 对新闻详情页url发起请求
            yield scrapy.Request(url=new_detail_url, callback=self.parse_detail, meta={'item': item})
@@ -50,11 +51,13 @@ class DagongSpider(scrapy.Spider):
    # 解析新闻内容
    def parse_detail(self, response):
        title = response.xpath('//div[@class="wrap_left"]/h2/text()').extract_first()
+
        content = response.xpath('//div[@class="wrap_left"]/div[3]//text()').extract()
        content = ''.join(content)
-        print(content)
+
        date = response.xpath('//div[@class="wrap_left"]/div[1]/div[1]/span[1]/text()').extract_first()
-        author = response.xpath('//div[@class="wrap_left"]/div[1]/div[1]/span[2]/text()').extract_first()
+
+        author = response.xpath('//div[@class="wrap_left"]/div[1]/div[1]/span[2]/text() | //div[@class="wrap_left"]/div[1]/div[1]/span[2]/a/text()').extract_first()

        item = response.meta['item']
        item['content'] = content