parent 8943cfa7c6
commit 30bcca17d0

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

@@ -0,0 +1,182 @@
import scrapy
from selenium import webdriver
from zjjtPro.items import ZjjtproItem
import re


class ZjjtSpider(scrapy.Spider):
    name = 'zjjt'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.zjvtit.edu.cn/']
    models_urls = []  # stores the URLs of the section (board) pages

    # Instantiate the browser object (only needed if pages have to be rendered by Selenium)
    # def __init__(self):
    #     self.bro = webdriver.Chrome(executable_path=r'D:\pythondemo\crawler\chromedriver.exe')

    # Parse each college section's URL from the home page
    def parse(self, response):
        li_list = response.xpath('/html/body/div[7]/div/ul/li')
        # Pick out the sections we need
        alist = [0, 1, 2, 3, 4, 5, 6, 7]
        for index in alist:
            module_url = li_list[index].xpath('./a/@href').extract_first()
            # print(module_url)
            self.models_urls.append(module_url)

        # Send a request to each section's URL in turn; enumerate so every request
        # carries its own section index rather than the leftover value from the loop above
        for index, url in enumerate(self.models_urls):
            yield scrapy.Request(url, callback=self.parse_model, meta={'index': index, 'mokuai': url})

    # Parse the detail pages behind the news titles in each section
    def parse_model(self, response):
        xy_index = response.meta['index']
        xy_url = response.meta['mokuai']

        if xy_index == 0:
            i = 4600
            news_detail_urls = []
            # Detail pages follow <section url>/info/<column id>/<article no>.htm;
            # walk the article numbers downwards and collect the candidate URLs
            while i >= 4400:
                info_list = [1064, ]
                for index in info_list:
                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
                    news_detail_urls.append(news_detail_url)
                    # print(news_detail_url)
                i -= 1
            # print(news_detail_urls)
            for url in news_detail_urls:
                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
        elif xy_index == 1:
            i = 5900
            news_detail_urls = []
            while i >= 5700:
                info_list = [1071, ]
                for index in info_list:
                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
                    news_detail_urls.append(news_detail_url)
                    # print(news_detail_url)
                i -= 1
            # print(news_detail_urls)
            for url in news_detail_urls:
                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
        elif xy_index == 2:
            i = 12400
            news_detail_urls = []
            while i >= 12300:
                info_list = [1067, ]
                for index in info_list:
                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
                    news_detail_urls.append(news_detail_url)
                    # print(news_detail_url)
                i -= 1
            # print(news_detail_urls)
            for url in news_detail_urls:
                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
        elif xy_index == 3:
            i = 4550
            news_detail_urls = []
            while i >= 4400:
                info_list = [1023, 1028, ]
                for index in info_list:
                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
                    news_detail_urls.append(news_detail_url)
                    # print(news_detail_url)
                i -= 1
            # print(news_detail_urls)
            for url in news_detail_urls:
                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
        elif xy_index == 4:
            i = 5200
            news_detail_urls = []
            while i >= 5000:
                info_list = [1088, 1089, ]
                for index in info_list:
                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
                    news_detail_urls.append(news_detail_url)
                    # print(news_detail_url)
                i -= 1
            # print(news_detail_urls)
            for url in news_detail_urls:
                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
        elif xy_index == 5:
            i = 7700
            news_detail_urls = []
            while i >= 7500:
                info_list = [1166, 1167, ]
                for index in info_list:
                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
                    news_detail_urls.append(news_detail_url)
                    # print(news_detail_url)
                i -= 1
            # print(news_detail_urls)
            for url in news_detail_urls:
                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
        elif xy_index == 6:
            i = 9900
            news_detail_urls = []
            while i >= 9700:
                info_list = [1747, 1774, ]
                for index in info_list:
                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
                    news_detail_urls.append(news_detail_url)
                    # print(news_detail_url)
                i -= 1
            # print(news_detail_urls)
            for url in news_detail_urls:
                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
        elif xy_index == 7:
            i = 5450
            news_detail_urls = []
            while i >= 5200:
                info_list = [1045, 1110, ]
                for index in info_list:
                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
                    news_detail_urls.append(news_detail_url)
                    # print(news_detail_url)
                i -= 1
            # print(news_detail_urls)
            for url in news_detail_urls:
                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})

    def parse_detail(self, response):
        url = response.meta['detail']
        xy_url = response.meta['xy_url']
        if response:
            # Build a news id from the last two path segments of the detail URL
            newsid = url.split('/')[-2] + url.split('/')[-1].split('.')[0]
            # The detail pages use several templates, so alternative XPaths are OR'ed together
            title = response.xpath('//form[@name="_newscontent_fromname"]/div/h2/text() | '
                                   '//form[@name="_newscontent_fromname"]/div/h3/text() | '
                                   '//td[@class="titlestyle112217"]/text() | '
                                   '//form[@name="_newscontent_fromname"]/ul/h1/text() | '
                                   '/html/body/table[4]/tbody/tr/td[2]/table[2]/tbody/tr/td/div/h1/text()').extract_first()
            date = response.xpath('//form[@name="_newscontent_fromname"]/div/div/em/text() | '
                                  '//form[@name="_newscontent_fromname"]/ul/div[1]/text() | '
                                  '/html/body/table[4]/tbody/tr/td[2]/table[2]/tbody/tr/td/div/div/text()').extract_first()
            content = response.xpath('//*[@id="vsb_content_2"]/div/p//text() | '
                                     '//*[@id="vsb_content"]/div/p//text() | '
                                     '//*[@id="vsb_content_4"]/div/p//text() | '
                                     '//*[@id="vsb_content_2"]/div/div/p//text() | '
                                     '//*[@class="content_set"]//text()').extract()
            content = ''.join(content)
            src = response.xpath(
                '//*[@id="vsb_content_2"]//img/@src | '
                '//*[@id="vsb_content"]//img/@src | '
                '//*[@id="vsb_content_4"]/div//img/@src | '
                '//*[@class="content_set"]/div//img/@src').extract_first()
            img_src = ''
            if src:
                img_src = xy_url + src

            # print(newsid, content)

            item = ZjjtproItem()
            item['index'] = response.meta['index']
            item['title'] = title
            item['date'] = date
            item['content'] = content
            item['news_id'] = newsid
            item['img_src'] = img_src

            yield item

    # Close the Selenium browser when the spider finishes (only needed if the
    # __init__ above is enabled)
    # def close(self, spider):
    #     self.bro.quit()
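
The spider imports ZjjtproItem from zjjtPro.items and fills the fields index, title, date, content, news_id and img_src in parse_detail. That items.py is not part of this diff; a minimal sketch consistent with those assignments (field names taken from the spider, everything else assumed) would be:

# zjjtPro/items.py -- minimal sketch inferred from the fields used in parse_detail
import scrapy


class ZjjtproItem(scrapy.Item):
    index = scrapy.Field()     # section index carried through the request meta
    title = scrapy.Field()     # news title
    date = scrapy.Field()      # publication date
    content = scrapy.Field()   # joined body text of the article
    news_id = scrapy.Field()   # id built from the detail-page URL
    img_src = scrapy.Field()   # absolute image URL, empty string if none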
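
The eight branches of parse_model differ only in the article-number range and the column (info) ids, so they could be driven by a lookup table. A possible sketch, not part of this commit, reusing exactly the ranges and ids hard-coded above (both the table and the method would live inside ZjjtSpider):

# Map each section index to (start, stop, column ids) and generate the same URLs.
SECTION_RANGES = {
    0: (4600, 4400, [1064]),
    1: (5900, 5700, [1071]),
    2: (12400, 12300, [1067]),
    3: (4550, 4400, [1023, 1028]),
    4: (5200, 5000, [1088, 1089]),
    5: (7700, 7500, [1166, 1167]),
    6: (9900, 9700, [1747, 1774]),
    7: (5450, 5200, [1045, 1110]),
}

def parse_model(self, response):
    xy_index = response.meta['index']
    xy_url = response.meta['mokuai']
    start, stop, info_ids = self.SECTION_RANGES[xy_index]
    # walk the article counter from start down to stop inclusive, as the while loops do
    for i in range(start, stop - 1, -1):
        for info_id in info_ids:
            url = xy_url + '/info/' + str(info_id) + '/' + str(i) + '.htm'
            yield scrapy.Request(url, callback=self.parse_detail,
                                 meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})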
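
The commented-out __init__ and close methods suggest the usual Scrapy + Selenium pattern, in which a downloader middleware uses the spider's browser to render pages before the callbacks parse them. No such middleware appears in this diff; if it were needed, a typical sketch (class and module names are assumptions, not taken from this commit) would be:

# Hypothetical downloader middleware pairing with the commented-out webdriver code.
from scrapy.http import HtmlResponse


class SeleniumDownloaderMiddleware:
    def process_response(self, request, response, spider):
        # Re-fetch the page with the spider's browser so JS-rendered content is
        # present, then wrap the rendered source in an HtmlResponse for the callbacks.
        bro = spider.bro
        bro.get(request.url)
        page_text = bro.page_source
        return HtmlResponse(url=request.url, body=page_text,
                            encoding='utf-8', request=request)

It would also have to be enabled in DOWNLOADER_MIDDLEWARES, and the __init__/close methods above uncommented so that spider.bro exists.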