From 30bcca17d01f52f341b2c5082980fb66f9588115 Mon Sep 17 00:00:00 2001
From: link_1999 <1402246900@qq.com>
Date: Tue, 3 May 2022 22:09:04 +0800
Subject: [PATCH] Upload files to 'zjjtPro/spiders'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Crawl news from each college of Zhejiang Institute of Communications
(浙江交通职业技术学院, www.zjvtit.edu.cn).
---
 zjjtPro/spiders/__init__.py |   4 +
 zjjtPro/spiders/zjjt.py     | 182 ++++++++++++++++++++++++++++++++++++
 2 files changed, 186 insertions(+)
 create mode 100644 zjjtPro/spiders/__init__.py
 create mode 100644 zjjtPro/spiders/zjjt.py

diff --git a/zjjtPro/spiders/__init__.py b/zjjtPro/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/zjjtPro/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/zjjtPro/spiders/zjjt.py b/zjjtPro/spiders/zjjt.py
new file mode 100644
index 0000000..7e84aa4
--- /dev/null
+++ b/zjjtPro/spiders/zjjt.py
@@ -0,0 +1,182 @@
+import scrapy
+from selenium import webdriver
+from zjjtPro.items import ZjjtproItem
+import re
+
+
+class ZjjtSpider(scrapy.Spider):
+    name = 'zjjt'
+    # allowed_domains = ['www.xxx.com']
+    start_urls = ['http://www.zjvtit.edu.cn/']
+    models_urls = []  # holds the URL of each college section
+
+    # Instantiate the browser object (only needed if Selenium rendering is enabled)
+    # def __init__(self):
+    #     self.bro = webdriver.Chrome(executable_path='D:\pythondemo\crawler\chromedriver.exe')
+
+    # Parse the home page and collect the URL of each college section
+    def parse(self, response):
+        li_list = response.xpath('/html/body/div[7]/div/ul/li')
+        # Indexes of the sections we want
+        alist = [0, 1, 2, 3, 4, 5, 6, 7]
+        for index in alist:
+            module_url = li_list[index].xpath('./a/@href').extract_first()
+            # print(module_url)
+            self.models_urls.append(module_url)
+
+        # Request each section URL in turn, passing its position along in meta
+        for index, url in enumerate(self.models_urls):
+            yield scrapy.Request(url, callback=self.parse_model, meta={'index': index, 'mokuai': url})
+
+    # For each section, build the detail-page URLs of its news items and request them
+    def parse_model(self, response):
+        xy_index = response.meta['index']
+        xy_url = response.meta['mokuai']
+
+        if xy_index == 0:
+            i = 4600
+            news_detail_urls = []
+            while i >= 4400:
+                info_list = [1064, ]
+                for index in info_list:
+                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
+                    news_detail_urls.append(news_detail_url)
+                    # print(news_detail_url)
+                i -= 1
+            # print(news_detail_urls)
+            for url in news_detail_urls:
+                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
+        elif xy_index == 1:
+            i = 5900
+            news_detail_urls = []
+            while i >= 5700:
+                info_list = [1071, ]
+                for index in info_list:
+                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
+                    news_detail_urls.append(news_detail_url)
+                    # print(news_detail_url)
+                i -= 1
+            # print(news_detail_urls)
+            for url in news_detail_urls:
+                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
+        elif xy_index == 2:
+            i = 12400
+            news_detail_urls = []
+            while i >= 12300:
+                info_list = [1067, ]
+                for index in info_list:
+                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
+                    news_detail_urls.append(news_detail_url)
+                    # print(news_detail_url)
+                i -= 1
+            # print(news_detail_urls)
+            for url in news_detail_urls:
+                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
+        elif xy_index == 3:
+            i = 4550
+            news_detail_urls = []
+            while i >= 4400:
+                info_list = [1023, 1028, ]
+                for index in info_list:
+                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
+                    news_detail_urls.append(news_detail_url)
+                    # print(news_detail_url)
+                i -= 1
+            # print(news_detail_urls)
+            for url in news_detail_urls:
+                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
+        elif xy_index == 4:
+            i = 5200
+            news_detail_urls = []
+            while i >= 5000:
+                info_list = [1088, 1089, ]
+                for index in info_list:
+                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
+                    news_detail_urls.append(news_detail_url)
+                    # print(news_detail_url)
+                i -= 1
+            # print(news_detail_urls)
+            for url in news_detail_urls:
+                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
+        elif xy_index == 5:
+            i = 7700
+            news_detail_urls = []
+            while i >= 7500:
+                info_list = [1166, 1167, ]
+                for index in info_list:
+                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
+                    news_detail_urls.append(news_detail_url)
+                    # print(news_detail_url)
+                i -= 1
+            # print(news_detail_urls)
+            for url in news_detail_urls:
+                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
+        elif xy_index == 6:
+            i = 9900
+            news_detail_urls = []
+            while i >= 9700:
+                info_list = [1747, 1774, ]
+                for index in info_list:
+                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
+                    news_detail_urls.append(news_detail_url)
+                    # print(news_detail_url)
+                i -= 1
+            # print(news_detail_urls)
+            for url in news_detail_urls:
+                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
+        elif xy_index == 7:
+            i = 5450
+            news_detail_urls = []
+            while i >= 5200:
+                info_list = [1045, 1110, ]
+                for index in info_list:
+                    news_detail_url = xy_url + '/info/' + str(index) + '/' + str(i) + '.htm'
+                    news_detail_urls.append(news_detail_url)
+                    # print(news_detail_url)
+                i -= 1
+            # print(news_detail_urls)
+            for url in news_detail_urls:
+                yield scrapy.Request(url, callback=self.parse_detail, meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})
+
+    def parse_detail(self, response):
+        url = response.meta['detail']
+        xy_url = response.meta['xy_url']
+        if response:
+            newsid = url.split('/')[-2] + url.split('/')[-1].split('.')[0]
+            title = response.xpath('//form[@name="_newscontent_fromname"]/div/h2/text() | '
+                                   '//form[@name="_newscontent_fromname"]/div/h3/text() | '
+                                   '//td[@class="titlestyle112217"]/text() | '
+                                   '//form[@name="_newscontent_fromname"]/ul/h1/text() | '
+                                   '/html/body/table[4]/tbody/tr/td[2]/table[2]/tbody/tr/td/div/h1/text()').extract_first()
+            date = response.xpath('//form[@name="_newscontent_fromname"]/div/div/em/text() | '
+                                  '//form[@name="_newscontent_fromname"]/ul/div[1]/text() | '
+                                  '/html/body/table[4]/tbody/tr/td[2]/table[2]/tbody/tr/td/div/div/text()').extract_first()
+            content = response.xpath('//*[@id="vsb_content_2"]/div/p//text() | '
+                                     '//*[@id="vsb_content"]/div/p//text() | '
+                                     '//*[@id="vsb_content_4"]/div/p//text() | '
+                                     '//*[@id="vsb_content_2"]/div/div/p//text() | '
+                                     '//*[@class="content_set"]//text()').extract()
+            content = ''.join(content)
+            src = response.xpath(
+                '//*[@id="vsb_content_2"]//img/@src | '
+                '//*[@id="vsb_content"]//img/@src | '
+                '//*[@id="vsb_content_4"]/div//img/@src | '
+                '//*[@class="content_set"]/div//img/@src').extract_first()
+            img_src = ''
+            if src:
+                img_src = xy_url + src
+
+            # print(newsid, content)
+
+            item = ZjjtproItem()
+            item['index'] = response.meta['index']
+            item['title'] = title
+            item['date'] = date
+            item['content'] = content
+            item['news_id'] = newsid
+            item['img_src'] = img_src
+
+            yield item
+
+    # def close(self, spider):
+    #     self.bro.quit()
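
Note: the spider imports ZjjtproItem from zjjtPro/items.py, which is not part of this
patch. A minimal sketch of what that definition would need to look like, inferred from
the six fields assigned in parse_detail (the class body below is an assumption, not
taken from this patch):

import scrapy


class ZjjtproItem(scrapy.Item):
    # Fields assumed from the assignments in ZjjtSpider.parse_detail
    index = scrapy.Field()     # position of the college section on the home page
    title = scrapy.Field()     # news headline text
    date = scrapy.Field()      # publication date string
    content = scrapy.Field()   # concatenated paragraph text of the article
    news_id = scrapy.Field()   # info id + article id taken from the detail URL
    img_src = scrapy.Field()   # absolute URL of the first image, if present

With the rest of the Scrapy project in place, the spider should be runnable from the
project root with, for example: scrapy crawl zjjt -o news.json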