Upload files to 'zjjtPro/spiders'

Crawl the news pages of each college of 浙江交通职业技术学院 (Zhejiang Institute of Communications)
This commit is contained in:
link_1999 2022-05-03 22:09:04 +08:00
parent 8943cfa7c6
commit 30bcca17d0
2 changed files with 186 additions and 0 deletions

4
zjjtPro/spiders/__init__.py Normal file

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
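
For context, a spider skeleton in this package is normally generated with Scrapy's genspider command and then filled in by hand; assuming that standard workflow, the file below would have started from something like:

    scrapy genspider zjjt www.zjvtit.edu.cn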

182
zjjtPro/spiders/zjjt.py Normal file

@@ -0,0 +1,182 @@
import scrapy
from selenium import webdriver
from zjjtPro.items import ZjjtproItem


class ZjjtSpider(scrapy.Spider):
    name = 'zjjt'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.zjvtit.edu.cn/']
    models_urls = []  # holds the URL of each section (one per college)

    # instantiate the browser object (for use by a Selenium download middleware)
    # def __init__(self):
    #     self.bro = webdriver.Chrome(executable_path='D:\pythondemo\crawler\chromedriver.exe')

    # parse the homepage and collect each college's section URL
    def parse(self, response):
        li_list = response.xpath('/html/body/div[7]/div/ul/li')
        # keep only the first eight entries, one per college
        for li in li_list[:8]:
            module_url = li.xpath('./a/@href').extract_first()
            self.models_urls.append(module_url)
        # request each section URL in turn, keeping its index paired with
        # the URL so downstream callbacks know which college it belongs to
        for index, url in enumerate(self.models_urls):
            yield scrapy.Request(url, callback=self.parse_model,
                                 meta={'index': index, 'mokuai': url})
    # per-section crawl parameters, keyed by section index:
    # (newest article id, oldest article id, list of column ids)
    section_params = {
        0: (4600, 4400, [1064]),
        1: (5900, 5700, [1071]),
        2: (12400, 12300, [1067]),
        3: (4550, 4400, [1023, 1028]),
        4: (5200, 5000, [1088, 1089]),
        5: (7700, 7500, [1166, 1167]),
        6: (9900, 9700, [1747, 1774]),
        7: (5450, 5200, [1045, 1110]),
    }

    # request the detail page behind each news headline in a section;
    # detail pages follow the pattern <section>/info/<column>/<id>.htm,
    # so enumerate candidate article ids from newest to oldest
    def parse_model(self, response):
        xy_index = response.meta['index']
        xy_url = response.meta['mokuai']
        start, end, info_list = self.section_params[xy_index]
        for i in range(start, end - 1, -1):
            for info in info_list:
                news_detail_url = xy_url + '/info/' + str(info) + '/' + str(i) + '.htm'
                yield scrapy.Request(news_detail_url, callback=self.parse_detail,
                                     meta={'index': xy_index, 'detail': news_detail_url,
                                           'xy_url': xy_url})
    # parse title, date, body text, and first image from a detail page
    def parse_detail(self, response):
        url = response.meta['detail']
        xy_url = response.meta['xy_url']
        # news id = column id + article id, both taken from the URL
        newsid = url.split('/')[-2] + url.split('/')[-1].split('.')[0]
        # the colleges use several page templates, so each field is a
        # union of the XPath variants observed across them
        title = response.xpath('//form[@name="_newscontent_fromname"]/div/h2/text() | '
                               '//form[@name="_newscontent_fromname"]/div/h3/text() | '
                               '//td[@class="titlestyle112217"]/text() | '
                               '//form[@name="_newscontent_fromname"]/ul/h1/text() | '
                               '/html/body/table[4]/tbody/tr/td[2]/table[2]/tbody/tr/td/div/h1/text()').extract_first()
        date = response.xpath('//form[@name="_newscontent_fromname"]/div/div/em/text() | '
                              '//form[@name="_newscontent_fromname"]/ul/div[1]/text() | '
                              '/html/body/table[4]/tbody/tr/td[2]/table[2]/tbody/tr/td/div/div/text()').extract_first()
        content = response.xpath('//*[@id="vsb_content_2"]/div/p//text() | '
                                 '//*[@id="vsb_content"]/div/p//text() | '
                                 '//*[@id="vsb_content_4"]/div/p//text() | '
                                 '//*[@id="vsb_content_2"]/div/div/p//text() | '
                                 '//*[@class="content_set"]//text()').extract()
        content = ''.join(content)
        src = response.xpath(
            '//*[@id="vsb_content_2"]//img/@src | '
            '//*[@id="vsb_content"]//img/@src | '
            '//*[@id="vsb_content_4"]/div//img/@src | '
            '//*[@class="content_set"]/div//img/@src').extract_first()
        img_src = ''
        if src:
            # image paths are site-relative, so prefix the section URL
            img_src = xy_url + src
        item = ZjjtproItem()
        item['index'] = response.meta['index']
        item['title'] = title
        item['date'] = date
        item['content'] = content
        item['news_id'] = newsid
        item['img_src'] = img_src
        yield item

    # quit the Selenium browser when the spider closes
    # def close(self, spider):
    #     self.bro.quit()
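
The spider fills a ZjjtproItem imported from zjjtPro/items.py, which is not part of this commit. A minimal sketch of what that module presumably declares, inferred from the six fields assigned in parse_detail (the field names come from the spider above; the comments are assumptions):

    # zjjtPro/items.py -- hypothetical sketch, not included in this commit
    import scrapy


    class ZjjtproItem(scrapy.Item):
        index = scrapy.Field()     # section (college) index
        title = scrapy.Field()     # news headline
        date = scrapy.Field()      # publication date
        content = scrapy.Field()   # concatenated body text
        news_id = scrapy.Field()   # column id + article id
        img_src = scrapy.Field()   # URL of the first image, empty if none

With that item defined and the project settings in place, the spider is run from the project root with scrapy crawl zjjt.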