import scrapy
from selenium import webdriver
from zjjtPro.items import ZjjtproItem
import re


class ZjjtSpider(scrapy.Spider):
    name = 'zjjt'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.zjvtit.edu.cn/']
    models_urls = []  # URLs of the college/board sections

    # Instantiate a browser object (only needed if the pages require JS rendering)
    # def __init__(self):
    #     self.bro = webdriver.Chrome(executable_path=r'D:\pythondemo\crawler\chromedriver.exe')

    # Parse the URL of each college/board from the home page
    def parse(self, response):
        li_list = response.xpath('/html/body/div[7]/div/ul/li')
        # Pick out the boards we need
        alist = [0, 1, 2, 3, 4, 5, 6, 7]
        for index in alist:
            module_url = li_list[index].xpath('./a/@href').extract_first()
            # print(module_url)
            self.models_urls.append(module_url)
        # Send a request to the URL of each board in turn; enumerate keeps each board
        # index paired with its URL (the original loop reused the stale `index` left
        # over from the loop above, so every request was tagged with index 7)
        for index, url in enumerate(self.models_urls):
            yield scrapy.Request(url, callback=self.parse_model,
                                 meta={'index': index, 'mokuai': url})

    # Per-board crawl ranges: board index -> (highest article id, lowest article id,
    # column ids). Detail URLs follow the pattern <board>/info/<column>/<article>.htm
    RANGES = {
        0: (4600, 4400, [1064]),
        1: (5900, 5700, [1071]),
        2: (12400, 12300, [1067]),
        3: (4550, 4400, [1023, 1028]),
        4: (5200, 5000, [1088, 1089]),
        5: (7700, 7500, [1166, 1167]),
        6: (9900, 9700, [1747, 1774]),
        7: (5450, 5200, [1045, 1110]),
    }

    # Build the detail-page URL of every news headline in a board and request it
    def parse_model(self, response):
        xy_index = response.meta['index']
        xy_url = response.meta['mokuai']
        if xy_index not in self.RANGES:
            return
        high, low, info_list = self.RANGES[xy_index]
        news_detail_urls = []
        for i in range(high, low - 1, -1):
            for info in info_list:
                news_detail_url = xy_url + '/info/' + str(info) + '/' + str(i) + '.htm'
                news_detail_urls.append(news_detail_url)
                # print(news_detail_url)
        # print(news_detail_urls)
        for url in news_detail_urls:
            yield scrapy.Request(url, callback=self.parse_detail,
                                 meta={'index': xy_index, 'detail': url, 'xy_url': xy_url})

    # Parse title, date, body text and first image from a news detail page
    def parse_detail(self, response):
        url = response.meta['detail']
        xy_url = response.meta['xy_url']
        if response:
            newsid = url.split('/')[-2] + url.split('/')[-1].split('.')[0]
            title = response.xpath('//form[@name="_newscontent_fromname"]/div/h2/text() | '
                                   '//form[@name="_newscontent_fromname"]/div/h3/text() | '
                                   '//td[@class="titlestyle112217"]/text() | '
                                   '//form[@name="_newscontent_fromname"]/ul/h1/text() | '
                                   '/html/body/table[4]/tbody/tr/td[2]/table[2]/tbody/tr/td/div/h1/text()').extract_first()
            date = response.xpath('//form[@name="_newscontent_fromname"]/div/div/em/text() | '
                                  '//form[@name="_newscontent_fromname"]/ul/div[1]/text() | '
                                  '/html/body/table[4]/tbody/tr/td[2]/table[2]/tbody/tr/td/div/div/text()').extract_first()
            content = response.xpath('//*[@id="vsb_content_2"]/div/p//text() | '
                                     '//*[@id="vsb_content"]/div/p//text() | '
                                     '//*[@id="vsb_content_4"]/div/p//text() | '
                                     '//*[@id="vsb_content_2"]/div/div/p//text() | '
                                     '//*[@class="content_set"]//text()').extract()
            content = ''.join(content)
            src = response.xpath('//*[@id="vsb_content_2"]//img/@src | '
                                 '//*[@id="vsb_content"]//img/@src | '
                                 '//*[@id="vsb_content_4"]/div//img/@src | '
                                 '//*[@class="content_set"]/div//img/@src').extract_first()
            img_src = ''
            if src:
                img_src = xy_url + src
            # print(newsid, content)
            item = ZjjtproItem()
            item['index'] = response.meta['index']
            item['title'] = title
            item['date'] = date
            item['content'] = content
            item['news_id'] = newsid
            item['img_src'] = img_src
            yield item

    # def close(self, spider):
    #     self.bro.quit()
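
# For reference, this spider assumes zjjtPro/items.py defines ZjjtproItem with the
# fields assigned in parse_detail above. A minimal sketch follows; the field set is
# inferred from this spider's item assignments, not copied from the actual items.py:
#
#   import scrapy
#
#   class ZjjtproItem(scrapy.Item):
#       index = scrapy.Field()    # position of the board/college in the nav list
#       title = scrapy.Field()    # news headline
#       date = scrapy.Field()     # publication date string
#       content = scrapy.Field()  # joined body text
#       news_id = scrapy.Field()  # column id + article id taken from the URL
#       img_src = scrapy.Field()  # URL of the first image in the article, if any
#
# Run the spider from the project root with: scrapy crawl zjjt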