From 30bcca17d01f52f341b2c5082980fb66f9588115 Mon Sep 17 00:00:00 2001
From: link_1999 <1402246900@qq.com>
Date: Tue, 3 May 2022 22:09:04 +0800
Subject: [PATCH] Upload files to 'zjjtPro/spiders'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Crawl news from each college of Zhejiang Institute of Communications
(浙江交通职业技术学院, www.zjvtit.edu.cn).
---
 zjjtPro/spiders/__init__.py |   4 +
 zjjtPro/spiders/zjjt.py     | 182 ++++++++++++++++++++++++++++++++++++
 2 files changed, 186 insertions(+)
 create mode 100644 zjjtPro/spiders/__init__.py
 create mode 100644 zjjtPro/spiders/zjjt.py

diff --git a/zjjtPro/spiders/__init__.py b/zjjtPro/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/zjjtPro/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/zjjtPro/spiders/zjjt.py b/zjjtPro/spiders/zjjt.py
new file mode 100644
index 0000000..7e84aa4
--- /dev/null
+++ b/zjjtPro/spiders/zjjt.py
@@ -0,0 +1,182 @@
+import scrapy
+from selenium import webdriver
+from zjjtPro.items import ZjjtproItem
+import re
+
+
+class ZjjtSpider(scrapy.Spider):
+    name = 'zjjt'
+    # allowed_domains = ['www.xxx.com']
+    start_urls = ['http://www.zjvtit.edu.cn/']
+    models_urls = []  # holds the URL of each college section
+
+    # Instantiate the browser object (only needed if Selenium rendering is enabled)
+    # def __init__(self):
+    #     self.bro = webdriver.Chrome(executable_path='D:\pythondemo\crawler\chromedriver.exe')
+
+    # Parse the home page and collect the URL of each college section
+    def parse(self, response):
+        li_list = response.xpath('/html/body/div[7]/div/ul/li')
+        # Indexes of the sections we want
+        alist = [0, 1, 2, 3, 4, 5, 6, 7]
+        for index in alist:
+            module_url = li_list[index].xpath('./a/@href').extract_first()
+            # print(module_url)
+            self.models_urls.append(module_url)
+
+        # Request each section URL in turn, passing its position along in meta
+        for index, url in enumerate(self.models_urls):
+            yield scrapy.Request(url, callback=self.parse_model, meta={'index': index, 'mokuai': url})
+
+    # For each section, build the detail-page URLs of its news items and request them
+    def parse_model(self, response):
+        xy_index = response.meta['index']
+        xy_url = response.meta['mokuai']
+
+        if xy_index == 0:
+            i = 4600
+            news_detail_urls = []
+            while i >= 4400:
+                info_list = [1064, ]
+                for index in info_list:
+                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
+                    news_detail_urls.append(news_detail_url)
+                    # print(news_detail_url)
+                i -= 1
+            # print(news_detail_urls)
+            for url in news_detail_urls:
+                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
+        elif xy_index == 1:
+            i = 5900
+            news_detail_urls = []
+            while i >= 5700:
+                info_list = [1071, ]
+                for index in info_list:
+                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
+                    news_detail_urls.append(news_detail_url)
+                    # print(news_detail_url)
+                i -= 1
+            # print(news_detail_urls)
+            for url in news_detail_urls:
+                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
+        elif xy_index == 2:
+            i = 12400
+            news_detail_urls = []
+            while i >= 12300:
+                info_list = [1067, ]
+                for index in info_list:
+                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
+                    news_detail_urls.append(news_detail_url)
+                    # print(news_detail_url)
+                i -= 1
+            # print(news_detail_urls)
+            for url in news_detail_urls:
+                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
+        elif xy_index == 3:
+            i = 4550
+            news_detail_urls = []
+            while i >= 4400:
+                info_list = [1023, 1028, ]
+                for index in info_list:
+                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
+                    news_detail_urls.append(news_detail_url)
+                    # print(news_detail_url)
+                i -= 1
+            # print(news_detail_urls)
+            for url in news_detail_urls:
+                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
+        elif xy_index == 4:
+            i = 5200
+            news_detail_urls = []
+            while i >= 5000:
+                info_list = [1088, 1089, ]
+                for index in info_list:
+                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
+                    news_detail_urls.append(news_detail_url)
+                    # print(news_detail_url)
+                i -= 1
+            # print(news_detail_urls)
+            for url in news_detail_urls:
+                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
+        elif xy_index == 5:
+            i = 7700
+            news_detail_urls = []
+            while i >= 7500:
+                info_list = [1166, 1167, ]
+                for index in info_list:
+                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
+                    news_detail_urls.append(news_detail_url)
+                    # print(news_detail_url)
+                i -= 1
+            # print(news_detail_urls)
+            for url in news_detail_urls:
+                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
+        elif xy_index == 6:
+            i = 9900
+            news_detail_urls = []
+            while i >= 9700:
+                info_list = [1747, 1774, ]
+                for index in info_list:
+                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
+                    news_detail_urls.append(news_detail_url)
+                    # print(news_detail_url)
+                i -= 1
+            # print(news_detail_urls)
+            for url in news_detail_urls:
+                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
+        elif xy_index == 7:
+            i = 5450
+            news_detail_urls = []
+            while i >= 5200:
+                info_list = [1045, 1110, ]
+                for index in info_list:
+                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
+                    news_detail_urls.append(news_detail_url)
+                    # print(news_detail_url)
+                i -= 1
+            # print(news_detail_urls)
+            for url in news_detail_urls:
+                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
+
+    def parse_detail(self, response):
+        url = response.meta['detail']
+        xy_url = response.meta['xy_url']
+        if response:
+            newsid = url.split('/')[-2] + url.split('/')[-1].split('.')[0]
+            title = response.xpath('//form[@name="_newscontent_fromname"]/div/h2/text() | '
+                                   '//form[@name="_newscontent_fromname"]/div/h3/text() | '
+                                   '//td[@class="titlestyle112217"]/text() | '
+                                   '//form[@name="_newscontent_fromname"]/ul/h1/text() | '
+                                   '/html/body/table[4]/tbody/tr/td[2]/table[2]/tbody/tr/td/div/h1/text()').extract_first()
+            date = response.xpath('//form[@name="_newscontent_fromname"]/div/div/em/text() | '
+                                  '//form[@name="_newscontent_fromname"]/ul/div[1]/text() | '
+                                  '/html/body/table[4]/tbody/tr/td[2]/table[2]/tbody/tr/td/div/div/text()').extract_first()
+            content = response.xpath('//*[@id="vsb_content_2"]/div/p//text() | '
+                                     '//*[@id="vsb_content"]/div/p//text() | '
+                                     '//*[@id="vsb_content_4"]/div/p//text() | '
+                                     '//*[@id="vsb_content_2"]/div/div/p//text() | '
+                                     '//*[@class="content_set"]//text()').extract()
+            content = ''.join(content)
+            src = response.xpath(
+                '//*[@id="vsb_content_2"]//img/@src | '
+                '//*[@id="vsb_content"]//img/@src | '
+                '//*[@id="vsb_content_4"]/div//img/@src | '
+                '//*[@class="content_set"]/div//img/@src').extract_first()
+            img_src = ''
+            if src:
+                img_src = xy_url + src
+
+            # print(newsid, content)
+
+            item = ZjjtproItem()
+            item['index'] = response.meta['index']
+            item['title'] = title
+            item['date'] = date
+            item['content'] = content
+            item['news_id'] = newsid
+            item['img_src'] = img_src
+
+            yield item
+
+    # def close(self, spider):
+    #     self.bro.quit()
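
Note: the spider imports ZjjtproItem from zjjtPro/items.py, which is not part of this
patch. A minimal sketch of what that definition would need to look like, inferred from
the six fields assigned in parse_detail (the class body below is an assumption, not
taken from this patch):

import scrapy


class ZjjtproItem(scrapy.Item):
    # Fields assumed from the assignments in ZjjtSpider.parse_detail
    index = scrapy.Field()     # position of the college section on the home page
    title = scrapy.Field()     # news headline text
    date = scrapy.Field()      # publication date string
    content = scrapy.Field()   # concatenated paragraph text of the article
    news_id = scrapy.Field()   # info id + article id taken from the detail URL
    img_src = scrapy.Field()   # absolute URL of the first image, if present

With the rest of the Scrapy project in place, the spider should be runnable from the
project root with, for example: scrapy crawl zjjt -o news.json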