上传文件至 'dagongPro/spiders'
This commit is contained in:
parent
88211ea6a5
commit
7c9cfd3292
|
@ -17,7 +17,7 @@ class DagongSpider(scrapy.Spider):
|
||||||
|
|
||||||
def parse(self, response):
|
def parse(self, response):
|
||||||
li_list = response.xpath('/html/body/div[2]/div/div[2]/ul[2]/li[2]/div/ul/li')
|
li_list = response.xpath('/html/body/div[2]/div/div[2]/ul[2]/li[2]/div/ul/li')
|
||||||
alist = [8] # [0内地, 2香港, 4两岸, 6国际, 8军事]
|
alist = [4] # [0内地, 2香港, 4两岸, 6国际, 8军事]
|
||||||
for index in alist:
|
for index in alist:
|
||||||
model_url = li_list[index].xpath('./a/@href').extract_first()
|
model_url = li_list[index].xpath('./a/@href').extract_first()
|
||||||
self.models_urls.append(model_url)
|
self.models_urls.append(model_url)
|
||||||
|
@ -38,11 +38,12 @@ class DagongSpider(scrapy.Spider):
|
||||||
title = re.sub(u"\\(.*?\\)|\\{.*?}|\\[.*?]|\\*.*?|\\:.*?|\\/.*?|\\\".*?|\\\\.*?|\\?.*?\\|.*?", "", name)
|
title = re.sub(u"\\(.*?\\)|\\{.*?}|\\[.*?]|\\*.*?|\\:.*?|\\/.*?|\\\".*?|\\\\.*?|\\?.*?\\|.*?", "", name)
|
||||||
|
|
||||||
new_detail_url = div.xpath('./dd[1]/a/@href').extract_first()
|
new_detail_url = div.xpath('./dd[1]/a/@href').extract_first()
|
||||||
# print(title)
|
news_id = 'newsid'+(new_detail_url.split('/')[-1]).split('.')[0]
|
||||||
|
|
||||||
item = DagongwangproItem()
|
item = DagongwangproItem()
|
||||||
item['title'] = title
|
item['title'] = title
|
||||||
item['src'] = src
|
item['src'] = src
|
||||||
|
item['news_id'] = news_id
|
||||||
|
|
||||||
# 对新闻详情页url发起请求
|
# 对新闻详情页url发起请求
|
||||||
yield scrapy.Request(url=new_detail_url, callback=self.parse_detail, meta={'item': item})
|
yield scrapy.Request(url=new_detail_url, callback=self.parse_detail, meta={'item': item})
|
||||||
|
@ -50,11 +51,13 @@ class DagongSpider(scrapy.Spider):
|
||||||
# 解析新闻内容
|
# 解析新闻内容
|
||||||
def parse_detail(self, response):
|
def parse_detail(self, response):
|
||||||
title = response.xpath('//div[@class="wrap_left"]/h2/text()').extract_first()
|
title = response.xpath('//div[@class="wrap_left"]/h2/text()').extract_first()
|
||||||
|
|
||||||
content = response.xpath('//div[@class="wrap_left"]/div[3]//text()').extract()
|
content = response.xpath('//div[@class="wrap_left"]/div[3]//text()').extract()
|
||||||
content = ''.join(content)
|
content = ''.join(content)
|
||||||
print(content)
|
|
||||||
date = response.xpath('//div[@class="wrap_left"]/div[1]/div[1]/span[1]/text()').extract_first()
|
date = response.xpath('//div[@class="wrap_left"]/div[1]/div[1]/span[1]/text()').extract_first()
|
||||||
author = response.xpath('//div[@class="wrap_left"]/div[1]/div[1]/span[2]/text()').extract_first()
|
|
||||||
|
author = response.xpath('//div[@class="wrap_left"]/div[1]/div[1]/span[2]/text() | //div[@class="wrap_left"]/div[1]/div[1]/span[2]/a/text()').extract_first()
|
||||||
|
|
||||||
item = response.meta['item']
|
item = response.meta['item']
|
||||||
item['content'] = content
|
item['content'] = content
|
||||||
|
|
Loading…
Reference in New Issue