parent
cd8ab806a0
commit
49af5c71b8
|
@ -0,0 +1,4 @@
|
|||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# Please refer to the documentation for information on how to create and manage
|
||||
# your spiders.
|
|
@ -0,0 +1,63 @@
|
|||
import scrapy
|
||||
from selenium import webdriver
|
||||
from dagongwangPro.items import DagongwangproItem
|
||||
import re
|
||||
|
||||
|
||||
class DagongSpider(scrapy.Spider):
    """Crawl news articles from selected sections of takungpao.com.

    Crawl flow:

    1. ``parse`` reads the section links from the news index page.
    2. ``parse_model`` walks the article list of each section and creates
       one item per article, carrying the cleaned list-page title.
    3. ``parse_detail`` adds the article body and date, then emits the item.

    A Selenium Chrome browser is created when the spider is constructed and
    stored on ``self.bro``. Nothing in this class uses it directly; it is
    presumably consumed elsewhere (e.g. by a downloader middleware that also
    reads ``models_urls``) -- confirm against the project's middlewares.
    """

    name = 'dagong'
    start_urls = ['http://www.takungpao.com/news/index.html']

    # Default chromedriver location (raw string: the backslashes are literal).
    # Scrapy copies ``-a key=value`` spider arguments onto the instance, so
    # ``-a chromedriver_path=...`` overrides this without editing the file.
    # NOTE(review): ``executable_path`` is deprecated in Selenium 4 -- confirm
    # the installed Selenium version still accepts it.
    chromedriver_path = r'D:\pythondemo\crawler\chromedriver.exe'

    # Positions of the section links inside the index-page navigation list:
    # 0 Mainland, 2 Hong Kong, 4 Cross-Strait, 6 International, 8 Military.
    # Only the International section is crawled.
    section_indexes = (6,)

    # Strips special characters (brackets, '*', ':', '/', '"', '\\', '?')
    # from article titles. Compiled once instead of on every article.
    _title_noise_re = re.compile(
        u"\\(.*?\\)|\\{.*?}|\\[.*?]|\\*.*?|\\:.*?|\\/.*?|\\\".*?|\\\\.*?|\\?.*?"
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider and start the Selenium-driven browser.

        Args:
            *args: Forwarded to ``scrapy.Spider.__init__``.
            **kwargs: Spider arguments (``-a key=value``), forwarded to
                ``scrapy.Spider.__init__``.
        """
        super(DagongSpider, self).__init__(*args, **kwargs)
        # Per-instance list of section URLs (previously a class-level list
        # that was shared between all instances of the spider).
        self.models_urls = []
        # Instantiate the browser object.
        self.bro = webdriver.Chrome(executable_path=self.chromedriver_path)

    def parse(self, response):
        """Collect the section URLs from the index page and request each one.

        Args:
            response: Response for the news index page in ``start_urls``.

        Yields:
            scrapy.Request: One request per section, handled by
            ``parse_model``.
        """
        li_list = response.xpath(
            '/html/body/div[2]/div/div[2]/ul[2]/li[2]/div/ul/li'
        )

        found_urls = []
        for index in self.section_indexes:
            try:
                section = li_list[index]
            except IndexError:
                # Page layout changed or the navigation list is shorter
                # than expected; skip instead of aborting the callback.
                self.logger.warning(
                    'Section index %d not found on %s', index, response.url
                )
                continue

            model_url = section.xpath('./a/@href').extract_first()
            if not model_url:
                self.logger.warning(
                    'Section %d has no link on %s', index, response.url
                )
                continue

            # Resolve relative links; absolute URLs pass through unchanged.
            model_url = response.urljoin(model_url)
            found_urls.append(model_url)
            if model_url not in self.models_urls:
                self.models_urls.append(model_url)

        # Send a request to each section URL in turn. All URLs are recorded
        # in ``models_urls`` before the first request is yielded.
        for url in found_urls:
            yield scrapy.Request(url, callback=self.parse_model)

    def parse_model(self, response):
        """Create an item per article in a section and request its page.

        Args:
            response: Response for one section's article-list page.

        Yields:
            scrapy.Request: One request per article, handled by
            ``parse_detail``, carrying the partially filled item in
            ``meta['item']``.
        """
        entries = response.xpath('//div[@class="wrapper clearfix"]/div[1]/dl')
        for entry in entries:
            name = entry.xpath('./dd[1]/a/text()').extract_first()
            new_detail_url = entry.xpath('./dd[1]/a/@href').extract_first()

            # Entries without a title or a link cannot produce an article;
            # skipping them avoids a TypeError in the regex substitution and
            # a ValueError when building the request.
            if name is None or not new_detail_url:
                self.logger.debug(
                    'Skipping list entry without title/link on %s',
                    response.url,
                )
                continue

            item = DagongwangproItem()
            # Strip special characters from the title.
            item['title'] = self._title_noise_re.sub("", name)

            # Request the news detail page.
            yield scrapy.Request(
                url=response.urljoin(new_detail_url),
                callback=self.parse_detail,
                meta={'item': item},
            )

    def parse_detail(self, response):
        """Fill in the article body and date, then emit the finished item.

        Args:
            response: Response for one article page; ``meta['item']`` holds
                the item created in ``parse_model``.

        Yields:
            DagongwangproItem: The item with ``title``, ``content`` and
            ``date`` set.
        """
        content = ''.join(
            response.xpath('//div[@class="wrap_left"]/div[3]//text()').extract()
        )
        date = ''.join(
            response.xpath(
                '//div[@class="wrap_left"]/div[1]/div[1]//text()'
            ).extract()
        )

        item = response.meta['item']
        item['content'] = content
        item['date'] = date

        yield item

    def close(self, spider):
        """Shut down the browser started in ``__init__``.

        NOTE(review): Scrapy's documented shutdown hook is
        ``closed(reason)``. The original ``close(self, spider)`` name and
        signature are kept so existing behaviour is unchanged -- confirm it
        is invoked on the Scrapy version in use.

        Args:
            spider: The spider being closed (unused).
        """
        self.bro.quit()
|
Loading…
Reference in New Issue