上传文件至 'dagongPro'

2022-04-04 21:42:25 +08:00 · 2022-04-04 21:42:25 +08:00 · 7eb432981c
parent 49af5c71b8
commit 7eb432981c
5 changed files with 186 additions and 0 deletions
--- a/dagongPro/__init__.py
+++ b/dagongPro/__init__.py
--- a/dagongPro/items.py
+++ b/dagongPro/items.py
@ -0,0 +1,12 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class DagongwangproItem(scrapy.Item):
+    """Scraped news record with the three fields consumed by the item pipeline."""
+    # Used by the pipeline as the name of the output text file.
+    title = scrapy.Field()
+    # Text written to the output file after the date line.
+    content = scrapy.Field()
+    # Written as the first line of the output file.
+    date = scrapy.Field()
--- a/dagongPro/middlewares.py
+++ b/dagongPro/middlewares.py
@ -0,0 +1,61 @@
+from scrapy import signals
+from itemadapter import is_item, ItemAdapter
+from scrapy.http import HtmlResponse
+from time import sleep
+import random
+
+
+class DagongwangproDownloaderMiddleware:
+    """Downloader middleware that rotates User-Agents and proxies and
+    replaces selected responses with browser-rendered HTML.
+
+    Two attributes are read from the spider:
+
+    * ``spider.bro`` -- a browser driver object offering ``get(url)`` and
+      ``page_source``.
+    * ``spider.models_urls`` -- the URLs whose responses must be replaced by
+      the browser-rendered page.
+    """
+
+    # Pool of User-Agent strings; one is chosen at random for every request.
+    user_agent_list = [
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
+        'Opera/8.0 (Windows NT 5.1; U; en)',
+        'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
+        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 ',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
+        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
+        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
+        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0) ',
+    ]
+    # Proxy pools as 'host:port' strings, selected by the scheme of the
+    # request URL. The https pool is currently empty.
+    PROXY_http = ['103.140.126.57:8888',
+                  '182.34.254.39:25624',
+                  '39.108.101.55:1080',
+                  '183.247.199.126:30001',
+                  '39.175.75.5:30001']
+    PROXY_https = []
+
+    def _pick_proxy(self, url):
+        """Return a random proxy URL matching the scheme of ``url``.
+
+        Returns None when the pool for that scheme is empty, because
+        ``random.choice`` raises IndexError on an empty sequence.
+        """
+        if url.split(':')[0] == 'http':
+            prefix, pool = 'http://', self.PROXY_http
+        else:
+            prefix, pool = 'https://', self.PROXY_https
+        if not pool:
+            return None
+        return prefix + random.choice(pool)
+
+    def process_request(self, request, spider):
+        """Give the outgoing request a random User-Agent and, if available, a proxy.
+
+        Always returns None so the request continues through the remaining
+        middlewares and the downloader.
+        """
+        # User-Agent spoofing.
+        request.headers['User-Agent'] = random.choice(self.user_agent_list)
+        # The proxy must go into request.meta['proxy']; a header named
+        # 'proxy' is merely transmitted to the target server and has no
+        # routing effect.
+        proxy = self._pick_proxy(request.url)
+        if proxy is not None:
+            request.meta['proxy'] = proxy
+        return None
+
+    def process_response(self, request, response, spider):
+        """Return browser-rendered HTML for URLs in ``spider.models_urls``.
+
+        Responses for every other URL are passed through unchanged, without
+        touching the browser.
+        """
+        if request.url not in spider.models_urls:
+            return response  # response of any other request
+
+        browser = spider.bro  # browser object defined on the spider
+        browser.get(request.url)  # load the section page in the browser
+        # Presumably gives the page time to finish loading before the source
+        # is read. NOTE(review): sleep() blocks the calling thread.
+        sleep(2)
+        return HtmlResponse(url=request.url, body=browser.page_source,
+                            encoding='utf-8', request=request)
+
+    def process_exception(self, request, exception, spider):
+        """Attach a different random proxy to a failed request and resubmit it.
+
+        Returns the modified request when a proxy is available for its scheme.
+        Returns None when the pool is empty, so the exception is left to the
+        remaining middlewares instead of raising IndexError here.
+        """
+        proxy = self._pick_proxy(request.url)
+        if proxy is None:
+            return None
+        request.meta['proxy'] = proxy
+        # NOTE(review): the request is returned as-is; verify that the
+        # scheduler's duplicate filter does not drop it (dont_filter is not
+        # set here).
+        return request
--- a/dagongPro/pipelines.py
+++ b/dagongPro/pipelines.py
@ -0,0 +1,24 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+
+class DagongwangproPipeline:
+    """Item pipeline that stores every item as a UTF-8 text file."""
+
+    def process_item(self, item, spider):
+        """Write ``item`` to ``./新闻/国际新闻/<title>.txt`` and return it.
+
+        The file holds the item's date on the first line followed by its
+        content. The output directory is created when missing, and
+        characters that are not valid in file names are replaced with ``_``
+        so an unusual title cannot break the write or point at another path.
+
+        Raises:
+            KeyError: if ``title``, ``content`` or ``date`` is missing.
+        """
+        # Imported locally so this block stays self-contained.
+        import os
+        import re
+
+        title = item['title']
+        content = item['content']
+        date = item['date']
+
+        news_dir = './新闻/国际新闻/'
+        # open() does not create missing parent directories.
+        os.makedirs(news_dir, exist_ok=True)
+
+        # Path separators, Windows-reserved characters and control characters
+        # are replaced; titles without them keep their original file name.
+        safe_title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title)
+
+        news_path = news_dir + safe_title + '.txt'
+        with open(news_path, 'w', encoding='utf-8') as fp:
+            fp.write(date + '\n' + content)
+
+        return item
--- a/dagongPro/settings.py
+++ b/dagongPro/settings.py
@ -0,0 +1,89 @@
+# Scrapy settings for dagongwangPro project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'dagongwangPro'
+
+# Packages searched for spiders, and the package where new spiders are created.
+# NOTE(review): these paths use the package name 'dagongwangPro' while this
+# commit places the files under 'dagongPro' -- confirm the real package name.
+SPIDER_MODULES = ['dagongwangPro.spiders']
+NEWSPIDER_MODULE = 'dagongwangPro.spiders'
+
+
+# Only messages of level ERROR and above are logged.
+LOG_LEVEL = 'ERROR'
+# Default User-Agent: a desktop Chrome string rather than a bot identifier.
+# DagongwangproDownloaderMiddleware overwrites this header on each request
+# with a random entry from its own list.
+USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
+
+# robots.txt rules are NOT obeyed: the check is switched off.
+ROBOTSTXT_OBEY = False
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'dagongwangPro.middlewares.DagongwangproSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+# Enables the project's downloader middleware; 543 is its order value in the
+# downloader middleware chain.
+DOWNLOADER_MIDDLEWARES = {
+   'dagongwangPro.middlewares.DagongwangproDownloaderMiddleware': 543,
+}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+# Enables the project's item pipeline; 300 is its order value among pipelines.
+ITEM_PIPELINES = {
+   'dagongwangPro.pipelines.DagongwangproPipeline': 300,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'