diff --git a/dagongPro/__init__.py b/dagongPro/__init__.py
new file mode 100644
index 0000000..06d7405
Binary files /dev/null and b/dagongPro/__init__.py differ
diff --git a/dagongPro/items.py b/dagongPro/items.py
new file mode 100644
index 0000000..8b4fa8e
--- /dev/null
+++ b/dagongPro/items.py
@@ -0,0 +1,12 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class DagongwangproItem(scrapy.Item):
+    title = scrapy.Field()
+    content = scrapy.Field()
+    date = scrapy.Field()
diff --git a/dagongPro/middlewares.py b/dagongPro/middlewares.py
new file mode 100644
index 0000000..5724d31
--- /dev/null
+++ b/dagongPro/middlewares.py
@@ -0,0 +1,60 @@
+from scrapy import signals
+from itemadapter import is_item, ItemAdapter
+from scrapy.http import HtmlResponse
+from time import sleep
+import random
+
+
+class DagongwangproDownloaderMiddleware:
+    user_agent_list = [
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
+        'Opera/8.0 (Windows NT 5.1; U; en)',
+        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
+        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
+        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
+        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
+    ]
+    PROXY_http = ['103.140.126.57:8888',
+                  '182.34.254.39:25624',
+                  '39.108.101.55:1080',
+                  '183.247.199.126:30001',
+                  '39.175.75.5:30001']
+    PROXY_https = []
+
+    def process_request(self, request, spider):
+        # UA spoofing: rotate the User-Agent on every request
+        request.headers['User-Agent'] = random.choice(self.user_agent_list)
+        # a proxy must be set in request.meta, not in the request headers
+        request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
+        return None
+
+    def process_response(self, request, response, spider):
+        # only the section pages need to be rendered with Selenium
+        if request.url in spider.models_urls:
+            bro = spider.bro  # the browser object defined on the spider
+            bro.get(request.url)  # load the section URL in the browser
+            sleep(2)  # wait for the dynamically loaded content
+            page_text = bro.page_source
+            return HtmlResponse(url=request.url, body=page_text,
+                                encoding='utf-8', request=request)
+        return response  # all other responses pass through unchanged
+
+    def process_exception(self, request, exception, spider):
+        # on failure, switch to a proxy matching the URL scheme
+        if request.url.startswith('https') and self.PROXY_https:
+            request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
+        else:
+            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
+        # resend the corrected request
+        return request
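The downloader middleware above depends on two attributes it reads off the spider: spider.bro (a Selenium browser used to render JavaScript-heavy section pages) and spider.models_urls (the list of section URLs whose responses get replaced with the rendered page source). The spiders/ module itself is not part of this diff, so the following is only a minimal sketch of a spider that would satisfy that contract; the spider name, start URL, and every XPath below are illustrative assumptions, not code from this repository.

import scrapy
from selenium import webdriver

from dagongwangPro.items import DagongwangproItem


class DagongSpider(scrapy.Spider):
    name = 'dagong'                                # assumed spider name
    start_urls = ['http://www.takungpao.com/']     # assumed portal URL

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.models_urls = []          # section pages the middleware intercepts
        self.bro = webdriver.Chrome()  # browser used in process_response

    def parse(self, response):
        # collect the section URLs (placeholder XPath)
        for li in response.xpath('//ul[@class="nav"]/li'):
            url = response.urljoin(li.xpath('./a/@href').extract_first())
            self.models_urls.append(url)
            yield scrapy.Request(url, callback=self.parse_model)

    def parse_model(self, response):
        # by now the middleware has swapped in the Selenium-rendered page
        for div in response.xpath('//div[@class="news-item"]'):  # placeholder
            item = DagongwangproItem()
            item['title'] = div.xpath('./h3/a/text()').extract_first()
            item['date'] = div.xpath('./span/text()').extract_first()
            detail_url = div.xpath('./h3/a/@href').extract_first()
            yield scrapy.Request(detail_url, callback=self.parse_detail,
                                 meta={'item': item})

    def parse_detail(self, response):
        item = response.meta['item']
        item['content'] = ''.join(
            response.xpath('//div[@class="article-content"]//text()').extract())
        yield item

    def closed(self, reason):
        self.bro.quit()  # shut the browser down together with the spider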
diff --git a/dagongPro/pipelines.py b/dagongPro/pipelines.py
new file mode 100644
index 0000000..096dec2
--- /dev/null
+++ b/dagongPro/pipelines.py
@@ -0,0 +1,28 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+import os
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+
+class DagongwangproPipeline:
+    # output directory ("News/International News")
+    NEWS_DIR = './新闻/国际新闻/'
+
+    def open_spider(self, spider):
+        # create the output directory up front so open() cannot fail
+        os.makedirs(self.NEWS_DIR, exist_ok=True)
+
+    def process_item(self, item, spider):
+        title = item['title']
+        content = item['content']
+        date = item['date']
+
+        news_path = self.NEWS_DIR + title + '.txt'
+        with open(news_path, 'w', encoding='utf-8') as fp:
+            fp.write(date + '\n' + content)
+
+        return item
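One caveat the pipeline does not handle: news titles routinely contain characters that are illegal in file names (/, :, ?, ", and so on), and any such title would make open() raise OSError. A hypothetical helper, not part of this diff, that could be applied before building news_path:

import re

def safe_filename(title: str) -> str:
    # replace characters that are invalid on Windows and/or Unix filesystems
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

# inside process_item:
#     news_path = self.NEWS_DIR + safe_filename(title) + '.txt'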
diff --git a/dagongPro/settings.py b/dagongPro/settings.py
new file mode 100644
index 0000000..d66c1cb
--- /dev/null
+++ b/dagongPro/settings.py
@@ -0,0 +1,89 @@
+# Scrapy settings for dagongwangPro project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'dagongwangPro'
+
+SPIDER_MODULES = ['dagongwangPro.spiders']
+NEWSPIDER_MODULE = 'dagongwangPro.spiders'
+
+
+LOG_LEVEL = 'ERROR'
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'dagongwangPro.middlewares.DagongwangproSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+DOWNLOADER_MIDDLEWARES = {
+    'dagongwangPro.middlewares.DagongwangproDownloaderMiddleware': 543,
+}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+    'dagongwangPro.pipelines.DagongwangproPipeline': 300,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
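With the downloader middleware and the pipeline registered in settings.py, the project runs like any other Scrapy project. A convenience launcher is sketched below; 'dagong' is an assumed spider name, since the spiders/ module is not included in this diff.

# launch.py -- run from the project root instead of typing the shell command
from scrapy import cmdline

cmdline.execute('scrapy crawl dagong'.split())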