diff --git a/dagongPro/spiders/dagong.py b/dagongPro/spiders/dagong.py
index 454fd8d..96bbf45 100644
--- a/dagongPro/spiders/dagong.py
+++ b/dagongPro/spiders/dagong.py
@@ -17,7 +17,7 @@ class DagongSpider(scrapy.Spider):
     def parse(self, response):
         li_list = response.xpath('/html/body/div[2]/div/div[2]/ul[2]/li[2]/div/ul/li')
-        alist = [8]  # [0 Mainland, 2 Hong Kong, 4 Cross-Strait, 6 International, 8 Military]
+        alist = [4]  # [0 Mainland, 2 Hong Kong, 4 Cross-Strait, 6 International, 8 Military]
         for index in alist:
             model_url = li_list[index].xpath('./a/@href').extract_first()
             self.models_urls.append(model_url)
@@ -38,11 +38,12 @@ class DagongSpider(scrapy.Spider):
             title = re.sub(u"\\(.*?\\)|\\{.*?}|\\[.*?]|\\*.*?|\\:.*?|\\/.*?|\\\".*?|\\\\.*?|\\?.*?\\|.*?", "", name)
             new_detail_url = div.xpath('./dd[1]/a/@href').extract_first()
-            # print(title)
+            news_id = 'newsid'+(new_detail_url.split('/')[-1]).split('.')[0]
             item = DagongwangproItem()
             item['title'] = title
             item['src'] = src
+            item['news_id'] = news_id
             # send a request to the news detail page URL
             yield scrapy.Request(url=new_detail_url, callback=self.parse_detail, meta={'item': item})
@@ -50,11 +51,13 @@ class DagongSpider(scrapy.Spider):
     # parse the news content
     def parse_detail(self, response):
         title = response.xpath('//div[@class="wrap_left"]/h2/text()').extract_first()
+
         content = response.xpath('//div[@class="wrap_left"]/div[3]//text()').extract()
         content = ''.join(content)
-        print(content)
+
         date = response.xpath('//div[@class="wrap_left"]/div[1]/div[1]/span[1]/text()').extract_first()
-        author = response.xpath('//div[@class="wrap_left"]/div[1]/div[1]/span[2]/text()').extract_first()
+
+        author = response.xpath('//div[@class="wrap_left"]/div[1]/div[1]/span[2]/text() | //div[@class="wrap_left"]/div[1]/div[1]/span[2]/a/text()').extract_first()
         item = response.meta['item']
         item['content'] = content
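
Note: the new item['news_id'] assignment only works if DagongwangproItem declares a matching field, and this patch does not touch dagongPro/items.py. The snippet below is a minimal sketch of what that item class would need, assuming the project follows the standard Scrapy items.py layout; only the fields actually referenced by the spider in this diff are shown.

    # dagongPro/items.py -- hedged sketch, not part of this diff
    import scrapy

    class DagongwangproItem(scrapy.Item):
        title = scrapy.Field()
        src = scrapy.Field()
        content = scrapy.Field()
        news_id = scrapy.Field()  # new field backing item['news_id'] set in dagong.py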