Python large-scale asynchronous news crawler; reversing Google Translate, Baidu Translate, Youdao Translate and Baidu Index

雨点打透心脏的1/2处 2022-11-12 04:28

Reference: https://www.yuanrenxue.com/crawler/news-crawler-urlpool.html

url_pool.py

# -*- coding: utf-8 -*-
# @Author : 佛祖保佑, 永无 bug
# @Date :
# @File : url_pool.py
# @Software: PyCharm
# @description : XXX

import time
import redis
import pickle
import urllib.parse as urlparse


class UrlDB(object):
    """Store URL status in redis."""

    status_failure = b'0'
    status_success = b'1'

    def __init__(self, db_name):
        # self.name = db_name + '.urldb'
        # self.db = leveldb.LevelDB(self.name)
        self.name = db_name if db_name else 'redis_hashmap'
        self.db = redis.StrictRedis()

    def set_success(self, url=None):
        if isinstance(url, str):
            url = url.encode('utf8')
        try:
            self.db.hset(self.name, url, self.status_success)
            status = True
        except BaseException as be:
            status = False
        return status

    def set_failure(self, url):
        if isinstance(url, str):
            url = url.encode('utf8')
        try:
            self.db.hset(self.name, url, self.status_failure)
            status = True
        except BaseException as be:
            status = False
        return status

    def has(self, url):
        if isinstance(url, str):
            url = url.encode('utf8')
        try:
            attr = self.db.hget(self.name, url)
            return attr
        except BaseException as be:
            pass
        return False


class UrlPool(object):
    """Pool that hands out and manages the URLs to be crawled."""

    def __init__(self, pool_name):
        self.name = pool_name
        self.db = UrlDB(pool_name)
        self.waiting = dict()    # {host: set([urls]), } URLs waiting to be downloaded, grouped by host
        self.pending = dict()    # {url: pended_time, } URLs already popped (being downloaded) but not yet reported back
        self.failure = dict()    # {url: times, } failure count per URL
        self.failure_threshold = 3
        self.pending_threshold = 10   # max seconds a URL may stay pending before it is downloaded again
        self.waiting_count = 0        # number of URLs in self.waiting
        self.max_hosts = ['', 0]      # [host, url_count] host currently holding the most URLs in the pool
        self.hub_pool = dict()        # {url: last_query_time, } hub URLs
        self.hub_refresh_span = 0
        self.load_cache()

    def __del__(self):
        self.dump_cache()

    def load_cache(self,):
        path = self.name + '.pkl'
        try:
            with open(path, 'rb') as f:
                self.waiting = pickle.load(f)
            cc = [len(v) for k, v in self.waiting.items()]
            print('saved pool loaded! urls:', sum(cc))
        except BaseException as be:
            pass

    def dump_cache(self):
        path = self.name + '.pkl'
        try:
            with open(path, 'wb') as f:
                pickle.dump(self.waiting, f)
            print('self.waiting saved!')
        except BaseException as be:
            pass

    def set_hubs(self, urls, hub_refresh_span):
        self.hub_refresh_span = hub_refresh_span
        self.hub_pool = dict()
        for url in urls:
            self.hub_pool[url] = 0

    def set_status(self, url, status_code):
        if url in self.pending:
            self.pending.pop(url)

        if status_code == 200:
            self.db.set_success(url)
            return
        if status_code == 404:
            self.db.set_failure(url)
            return
        if url in self.failure:
            self.failure[url] += 1
            if self.failure[url] > self.failure_threshold:
                self.db.set_failure(url)
                self.failure.pop(url)
            else:
                self.add(url)
        else:
            self.failure[url] = 1
            self.add(url)

    def push_to_pool(self, url=None):
        host = urlparse.urlparse(url).netloc
        if not host or '.' not in host:
            print('try to push_to_pool with bad url:', url, ', len of url:', len(url))
            return False
        if host in self.waiting:
            if url in self.waiting[host]:
                return True
            self.waiting[host].add(url)
            if len(self.waiting[host]) > self.max_hosts[1]:
                self.max_hosts[1] = len(self.waiting[host])
                self.max_hosts[0] = host
        else:
            self.waiting[host] = set([url])
        self.waiting_count += 1
        return True

    def add(self, url=None, always=False):
        if always:
            return self.push_to_pool(url)
        pended_time = self.pending.get(url, 0)
        if time.time() - pended_time < self.pending_threshold:
            print('being downloading:', url)
            return
        if self.db.has(url):
            return
        if pended_time:
            self.pending.pop(url)
        return self.push_to_pool(url)

    def add_many(self, url_list=None, always=False):
        if isinstance(url_list, str):
            print('urls is a str !!!!', url_list)
            self.add(url_list, always)
        else:
            for url in url_list:
                self.add(url, always)

    def pop(self, count=None, hub_percent=50):
        print('\n\tmax of host:', self.max_hosts)
        # popped URLs carry one of two flags: hub = 1, ordinary = 0
        url_attr_url = 0
        url_attr_hub = 1
        # 1. pop hub URLs first, so the newest links on the hub pages are picked up in time
        hubs = dict()
        hub_count = count * hub_percent // 100
        for hub in self.hub_pool:
            span = time.time() - self.hub_pool[hub]
            if span < self.hub_refresh_span:
                continue
            hubs[hub] = url_attr_hub  # 1 means hub-url
            self.hub_pool[hub] = time.time()
            if len(hubs) >= hub_count:
                break
        # 2. then pop ordinary URLs
        left_count = count - len(hubs)
        urls = dict()
        for host in self.waiting:
            if not self.waiting[host]:
                continue
            url = self.waiting[host].pop()
            urls[url] = url_attr_url
            self.pending[url] = time.time()
            if self.max_hosts[0] == host:
                self.max_hosts[1] -= 1
            if len(urls) >= left_count:
                break
        self.waiting_count -= len(urls)
        print('To pop:%s, hubs: %s, urls: %s, hosts:%s' % (count, len(hubs), len(urls), len(self.waiting)))
        urls.update(hubs)
        return urls

    def size(self,):
        return self.waiting_count

    def empty(self,):
        return self.waiting_count == 0


def test():
    pool = UrlPool('crawl_url_pool')
    urls = [
        'http://1.a.cn/xyz',
        'http://2.a.cn/xyz',
        'http://3.a.cn/xyz',
        'http://1.b.cn/xyz-1',
        'http://1.b.cn/xyz-2',
        'http://1.b.cn/xyz-3',
        'http://1.b.cn/xyz-4',
    ]
    pool.add_many(urls)
    # del pool
    # pool = UrlPool('crawl_url_pool')
    urls = pool.pop(5)
    urls = list(urls.keys())
    print('pop:', urls)
    print('pending:', pool.pending)
    pool.set_status(urls[0], 200)
    print('pending:', pool.pending)
    pool.set_status(urls[1], 404)
    print('pending:', pool.pending)


if __name__ == '__main__':
    test()

ezpymysql.py

Reference: 大规模异步新闻爬虫:让 MySQL 数据库操作更方便 - 猿人学

# file: ezpymysql.py
# Author: veelion

"""A lightweight wrapper around PyMySQL.
only for python3
"""

import time
import logging
import traceback
import pymysql
import pymysql.cursors

version = "0.7"
version_info = (0, 7, 0, 0)


class Connection(object):
    """A lightweight wrapper around PyMySQL."""

    def __init__(self, host, database, user=None, password=None,
                 port=0, max_idle_time=7 * 3600, connect_timeout=10,
                 time_zone="+0:00", charset="utf8mb4", sql_mode="TRADITIONAL"):
        self.host = host
        self.database = database
        self.max_idle_time = float(max_idle_time)

        args = dict(
            use_unicode=True, charset=charset, database=database,
            init_command=('SET time_zone = "%s"' % time_zone),
            cursorclass=pymysql.cursors.DictCursor,
            connect_timeout=connect_timeout, sql_mode=sql_mode
        )
        if user is not None:
            args["user"] = user
        if password is not None:
            args["passwd"] = password

        # We accept a path to a MySQL socket file or a host(:port) string
        if "/" in host:
            args["unix_socket"] = host
        else:
            self.socket = None
            pair = host.split(":")
            if len(pair) == 2:
                args["host"] = pair[0]
                args["port"] = int(pair[1])
            else:
                args["host"] = host
                args["port"] = 3306
        if port:
            args['port'] = port

        self._db = None
        self._db_args = args
        self._last_use_time = time.time()
        try:
            self.reconnect()
        except BaseException as be:
            logging.error("Cannot connect to MySQL on %s", self.host, exc_info=True)

    def _ensure_connected(self):
        # Mysql by default closes client connections that are idle for
        # 8 hours, but the client library does not report this fact until
        # you try to perform a query and it fails. Protect against this
        # case by preemptively closing and reopening the connection
        # if it has been idle for too long (7 hours by default).
        if self._db is None or (time.time() - self._last_use_time > self.max_idle_time):
            self.reconnect()
        self._last_use_time = time.time()

    def _cursor(self):
        self._ensure_connected()
        return self._db.cursor()

    def __del__(self):
        self.close()

    def close(self):
        """Closes this database connection."""
        if getattr(self, "_db", None) is not None:
            self._db.close()
            self._db = None

    def reconnect(self):
        """Closes the existing database connection and re-opens it."""
        self.close()
        self._db = pymysql.connect(**self._db_args)
        self._db.autocommit(True)

    def query(self, query, *parameters, **kwparameters):
        """Returns a row list for the given query and parameters."""
        cursor = self._cursor()
        try:
            cursor.execute(query, kwparameters or parameters)
            result = cursor.fetchall()
            return result
        finally:
            cursor.close()

    def get(self, query, *parameters, **kwparameters):
        """Returns the (singular) row returned by the given query."""
        cursor = self._cursor()
        try:
            cursor.execute(query, kwparameters or parameters)
            return cursor.fetchone()
        finally:
            cursor.close()

    def execute(self, query, *parameters, **kwparameters):
        """Executes the given query, returning the lastrowid from the query."""
        cursor = self._cursor()
        try:
            cursor.execute(query, kwparameters or parameters)
            return cursor.lastrowid
        except Exception as e:
            if e.args[0] == 1062:
                # ignore duplicate-key errors
                pass
            else:
                traceback.print_exc()
                raise e
        finally:
            cursor.close()

    insert = execute

    # =============== high level method for table ===================
    def table_has(self, table_name, field, value):
        if isinstance(value, str):
            value = value.encode('utf8')
        sql_str = f'SELECT {field} FROM {table_name} WHERE {field}=%s'
        d = self.get(sql_str, value)
        return d

    def table_insert(self, table_name, item):
        """item is a dict : key is mysql table field"""
        fields = list(item.keys())
        values = list(item.values())
        field_str = ','.join(fields)
        val_str = ','.join(['%s'] * len(item))
        for i in range(len(values)):
            if isinstance(values[i], str):
                values[i] = values[i].encode('utf8')
        sql_str = f'INSERT INTO {table_name} ({field_str}) VALUES({val_str})'
        try:
            last_id = self.execute(sql_str, *values)
            return last_id
        except Exception as e:
            if e.args[0] == 1062:
                # just skip duplicated item
                pass
            else:
                traceback.print_exc()
                print('sql:', sql_str)
                print('item:')
                for i in range(len(fields)):
                    vs = str(values[i])
                    if len(vs) > 300:
                        print(fields[i], ' : ', len(vs), type(values[i]))
                    else:
                        print(fields[i], ' : ', vs, type(values[i]))
                raise e

    def table_update(self, table_name, updates, field_where, value_where):
        """updates is a dict of {field_update:value_update}"""
        upsets = []
        values = []
        for k, v in updates.items():
            s = '%s=%%s' % k
            upsets.append(s)
            values.append(v)
        upsets = ','.join(upsets)
        sql_str = f'UPDATE {table_name} SET {upsets} WHERE {field_where}="{value_where}"'
        self.execute(sql_str, *values)


if __name__ == '__main__':
    db = Connection(
        'localhost',
        'db_name',
        'user',
        'password'
    )
    # fetch a single row
    sql = 'select * from test_table where id=%s'
    data = db.get(sql, 2)
    # fetch multiple rows
    sql = 'select * from test_table where id>%s'
    data = db.query(sql, 2)
    # insert one row
    sql = 'insert into test_table(title, url) values(%s, %s)'
    last_id = db.execute(sql, 'test', 'http://a.com/')
    # or
    last_id = db.insert(sql, 'test', 'http://a.com/')
    # insert one row with the higher-level helper
    item = {
        'title': 'test',
        'url': 'http://a.com/',
    }
    last_id = db.table_insert('test_table', item)
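The __main__ block above does not exercise the other two table helpers; continuing the same (hypothetical) test_table example, their usage looks like this:

# check whether a row with the given field value exists (returns the row dict or None)
row = db.table_has('test_table', 'url', 'http://a.com/')
# update fields of the rows matching field_where = value_where
db.table_update('test_table', {'title': 'new title'}, 'url', 'http://a.com/')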

functions.py

# -*- coding: utf-8 -*-
# @Author : 佛祖保佑, 永无 bug
# @Date :
# @File : functions.py
# @Software: PyCharm
# @description : XXX

import re
import requests
import cchardet
import traceback
import urllib.parse as urlparse


async def fetch(session=None, url=None, headers=None, timeout=9, binary=False):
    _headers = {
        'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; '
                       'Windows NT 6.1; Win64; x64; Trident/5.0)'),
    }
    _headers = headers if headers else _headers
    try:
        async with session.get(url, headers=_headers, timeout=timeout) as response:
            status_code = response.status
            html_bin_or_text = await response.read()
            if not binary:
                encoding = cchardet.detect(html_bin_or_text)['encoding']
                html_bin_or_text = html_bin_or_text.decode(encoding, errors='ignore')
            request_url = str(response.url)
    except Exception as e:
        msg = 'Failed download: {} | exception: {}, {}'.format(url, str(type(e)), str(e))
        print(msg)
        html_bin_or_text = b'' if binary else ''
        status_code = -1
        request_url = url
    return status_code, html_bin_or_text, request_url


def downloader(url=None, timeout=10, headers=None, debug=False, binary=False):
    _headers = {
        'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; '
                       'Windows NT 6.1; Win64; x64; Trident/5.0)'),
    }
    _headers = headers if headers else _headers
    request_url = url
    try:
        r = requests.get(url, headers=_headers, timeout=timeout)
        if binary:
            html_bin_or_text = r.content
        else:
            encoding = cchardet.detect(r.content)['encoding']
            html_bin_or_text = r.content.decode(encoding, errors='ignore')
        status_code = r.status_code
        request_url = r.url
    except BaseException as be:
        if debug:
            traceback.print_exc()
        msg = 'failed download: {}'.format(url)
        print(msg)
        html_bin_or_text = b'' if binary else ''
        status_code = -1
    return status_code, html_bin_or_text, request_url


g_bin_postfix = {
    'exe', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx', 'pdf',
    'jpg', 'png', 'bmp', 'jpeg', 'gif', 'zip', 'rar', 'tar',
    'bz2', '7z', 'gz', 'flv', 'mp4', 'avi', 'wmv', 'mkv', 'apk'
}
g_news_postfix = ['.html?', '.htm?', '.shtml?', '.shtm?']


def clean_url(url=None):
    # 1. is it a valid http(s) url?
    if not url.startswith('http'):
        return ''
    # 2. strip the query string that follows a "static" page suffix
    for np in g_news_postfix:
        p = url.find(np)
        if p > -1:
            p = url.find('?')
            url = url[:p]
            return url
    # 3. skip links that point to binary content
    up = urlparse.urlparse(url)
    path = up.path
    if not path:
        path = '/'
    postfix = path.split('.')[-1].lower()
    if postfix in g_bin_postfix:
        return ''
    # 4. drop query parameters that only mark traffic sources
    # badquery = ['spm', 'utm_source', 'utm_medium', 'utm_campaign']
    good_queries = []
    for query in up.query.split('&'):
        qv = query.split('=')
        if qv[0].startswith('spm') or qv[0].startswith('utm_'):
            continue
        if len(qv) == 1:
            continue
        good_queries.append(query)
    query = '&'.join(good_queries)
    url = urlparse.urlunparse((
        up.scheme,
        up.netloc,
        path,
        up.params,
        query,
        ''  # crawler does not care about the fragment
    ))
    return url


g_pattern_tag_a = re.compile(r'<a[^>]*?href=[\'"]?([^> \'"]+)[^>]*?>(.*?)</a>', re.I | re.S | re.M)


def extract_links_re(url=None, html=None):
    """use re module to extract links from html"""
    news_links = set()
    tag_a_list = g_pattern_tag_a.findall(html)
    for tag_a in tag_a_list:
        link = tag_a[0].strip()
        if not link:
            continue
        link = urlparse.urljoin(url, link)
        link = clean_url(link)
        if not link:
            continue
        news_links.add(link)
    return news_links


def init_file_logger(f_name=None):
    # config logging
    import logging
    from logging.handlers import TimedRotatingFileHandler
    ch = TimedRotatingFileHandler(f_name, when="midnight")
    ch.setLevel(logging.INFO)
    # create formatter
    fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    formatter = logging.Formatter(fmt)
    # add formatter to ch
    ch.setFormatter(formatter)
    logger = logging.getLogger(f_name)
    # add ch to logger
    logger.addHandler(ch)
    return logger


if __name__ == '__main__':
    temp_url = 'http://news.baidu.com/'
    t_status_code, t_html, t_url = downloader(url=temp_url)
    print(f'[{t_status_code}, {t_url}]:{len(t_html)}')
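To make the cleaning rules above concrete, a few illustrative calls (the URLs are made-up examples):

# query string after a "static" suffix is stripped (rule 2)
clean_url('http://news.example.com/a/b.html?spm=123')       # -> 'http://news.example.com/a/b.html'
# links to binary content are rejected (rule 3)
clean_url('http://example.com/pic.jpg')                     # -> ''
# traffic-source parameters are dropped, real ones are kept (rule 4)
clean_url('http://example.com/news?id=3&utm_source=weibo')  # -> 'http://example.com/news?id=3'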

config.py

db_host = 'localhost'
db_db = 'crawler'
db_user = 'your-user'
db_password = 'your-password'
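The crawlers below read their hub list from a crawler_hub table and write pages into crawler_html. The original post does not show the schema; a minimal sketch inferred from the SQL used in news_sync.py / news_async.py (column types and sizes are assumptions, adjust to your needs) can be kept as a constant and executed once:

# Hypothetical DDL inferred from the queries used below.
CREATE_TABLES_SQL = """
CREATE TABLE IF NOT EXISTS crawler_hub (
    id INT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
    url VARCHAR(1024) NOT NULL
) DEFAULT CHARSET = utf8mb4;

CREATE TABLE IF NOT EXISTS crawler_html (
    id INT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
    urlhash BIGINT UNSIGNED NOT NULL,
    url VARCHAR(1024) NOT NULL,
    html_lzma LONGBLOB NOT NULL,
    UNIQUE KEY uk_urlhash (urlhash)  -- duplicate inserts raise error 1062, which the crawlers catch
) DEFAULT CHARSET = utf8mb4;
"""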

News crawler (synchronous)

news_sync.py

# -*- coding: utf-8 -*-
# @Author : 佛祖保佑, 永无 bug
# @Date :
# @File : news_sync.py
# @Software: PyCharm
# @description : XXX

import urllib.parse as urlparse
import lzma
import farmhash
import traceback

from ezpymysql import Connection
from url_pool import UrlPool
import functions as fn
import config


class NewsCrawlerSync:
    def __init__(self, name):
        self.db = Connection(
            config.db_host,
            config.db_db,
            config.db_user,
            config.db_password
        )
        self.logger = fn.init_file_logger(name + '.log')
        self.url_pool = UrlPool(name)
        self.hub_hosts = None
        self.load_hubs()

    def load_hubs(self,):
        sql = 'select url from crawler_hub'
        data = self.db.query(sql)
        self.hub_hosts = set()
        hubs = []
        for d in data:
            host = urlparse.urlparse(d['url']).netloc
            self.hub_hosts.add(host)
            hubs.append(d['url'])
        self.url_pool.set_hubs(hubs, 300)

    def save_to_db(self, url, html):
        url_hash = farmhash.hash64(url)
        sql = 'select url from crawler_html where urlhash=%s'
        d = self.db.get(sql, url_hash)
        if d:
            if d['url'] != url:
                msg = 'farm_hash collision: %s <=> %s' % (url, d['url'])
                self.logger.error(msg)
            return True
        if isinstance(html, str):
            html = html.encode('utf8')
        html_lzma = lzma.compress(html)
        sql = 'insert into crawler_html(urlhash, url, html_lzma) values(%s, %s, %s)'
        good = False
        try:
            self.db.execute(sql, url_hash, url, html_lzma)
            good = True
        except Exception as e:
            if e.args[0] == 1062:
                # Duplicate entry
                good = True
            else:
                traceback.print_exc()
                raise e
        return good

    def filter_good(self, urls):
        good_links = []
        for url in urls:
            host = urlparse.urlparse(url).netloc
            if host in self.hub_hosts:
                good_links.append(url)
        return good_links

    def process(self, url, is_hub):
        status, html, redirected_url = fn.downloader(url)
        self.url_pool.set_status(url, status)
        if redirected_url != url:
            self.url_pool.set_status(redirected_url, status)
        # extract links from hub pages; news pages also contain "related news"
        # links, extract those as well if you need them
        if status != 200:
            return
        if is_hub:
            new_links = fn.extract_links_re(redirected_url, html)
            good_links = self.filter_good(new_links)
            print(f"{len(good_links)} / {len(new_links)}, good_links/new_links")
            self.url_pool.add_many(good_links)
        else:
            self.save_to_db(redirected_url, html)

    def run(self,):
        while 1:
            urls = self.url_pool.pop(5)
            for url, is_hub in urls.items():
                self.process(url, is_hub)


if __name__ == '__main__':
    crawler = NewsCrawlerSync('sync_spider')
    crawler.run()

News crawler (asynchronous)

news_async.py

# -*- coding: utf-8 -*-
# @Author : 佛祖保佑, 永无 bug
# @Date :
# @File : news_async.py
# @Software: PyCharm
# @description : XXX

import traceback
import time
import asyncio
import aiohttp
import urllib.parse as urlparse
import farmhash
import lzma

# import uvloop
# asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
import sanicdb

from url_pool import UrlPool
import functions as fn
import config


class NewsCrawlerAsync:
    def __init__(self, name):
        self.hub_hosts = set()
        self._workers = 0
        self._workers_max = 30
        self.logger = fn.init_file_logger(name + '.log')
        self.url_pool = UrlPool(name)
        self.loop = asyncio.get_event_loop()
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.db = sanicdb.SanicDB(
            config.db_host,
            config.db_db,
            config.db_user,
            config.db_password,
            loop=self.loop
        )

    async def load_hubs(self, ):
        sql = 'select url from crawler_hub'
        data = await self.db.query(sql)
        hubs = []
        for d in data:
            host = urlparse.urlparse(d['url']).netloc
            self.hub_hosts.add(host)
            hubs.append(d['url'])
        self.url_pool.set_hubs(hubs, 300)

    async def save_to_db(self, url, html):
        url_hash = farmhash.hash64(url)
        sql = 'select url from crawler_html where urlhash=%s'
        d = await self.db.get(sql, url_hash)
        if d:
            if d['url'] != url:
                msg = 'farmhash collision: %s <=> %s' % (url, d['url'])
                self.logger.error(msg)
            return True
        if isinstance(html, str):
            html = html.encode('utf8')
        html_lzma = lzma.compress(html)
        sql = 'insert into crawler_html(urlhash, url, html_lzma) values(%s, %s, %s)'
        good = False
        try:
            await self.db.execute(sql, url_hash, url, html_lzma)
            good = True
        except Exception as e:
            if e.args[0] == 1062:
                # Duplicate entry
                good = True
            else:
                traceback.print_exc()
                raise e
        return good

    def filter_good(self, urls):
        good_links = []
        for url in urls:
            host = urlparse.urlparse(url).netloc
            if host in self.hub_hosts:
                good_links.append(url)
        return good_links

    async def process(self, url, is_hub):
        status, html, redirected_url = await fn.fetch(self.session, url)
        self.url_pool.set_status(url, status)
        if redirected_url != url:
            self.url_pool.set_status(redirected_url, status)
        # extract links from hub pages; news pages also contain "related news"
        # links, extract those as well if you need them
        if status != 200:
            self._workers -= 1
            return
        if is_hub:
            new_links = fn.extract_links_re(redirected_url, html)
            good_links = self.filter_good(new_links)
            print(f"{len(good_links)} / {len(new_links)}, good_links / new_links")
            self.url_pool.add_many(good_links)
        else:
            await self.save_to_db(redirected_url, html)
        self._workers -= 1

    async def loop_crawl(self):
        await self.load_hubs()
        last_rating_time = time.time()
        counter = 0
        while 1:
            to_pop = self._workers_max - self._workers
            tasks = self.url_pool.pop(to_pop)
            if not tasks:
                print('no url to crawl, sleep')
                await asyncio.sleep(3)
                continue
            for url, is_hub in tasks.items():
                self._workers += 1
                counter += 1
                print('crawl:', url)
                asyncio.ensure_future(self.process(url, is_hub))
            gap = time.time() - last_rating_time
            if gap > 5:
                rate = counter / gap
                print(f'\tloop_crawl() rate:{round(rate, 2)}, counter: {counter}, workers: {self._workers}')
                last_rating_time = time.time()
                counter = 0
            if self._workers > self._workers_max:
                print('====== got workers_max, sleep 3 sec to next worker =====')
                await asyncio.sleep(3)

    def run(self):
        try:
            self.loop.run_until_complete(self.loop_crawl())
        except KeyboardInterrupt:
            print('stopped by yourself!')
            del self.url_pool


if __name__ == '__main__':
    nc = NewsCrawlerAsync('async_spider')
    nc.run()

Distributed crawler (client-server model)

server.py

# -*- coding: utf-8 -*-
# @Author : 佛祖保佑, 永无 bug
# @Date :
# @File : server.py
# @Software: PyCharm
# @description : XXX

from sanic import Sanic
from sanic import response

from url_pool import UrlPool

url_pool = UrlPool(__file__)

# initialise the url_pool; adjust to your own needs
hub_urls = []
url_pool.set_hubs(hub_urls, 300)
url_pool.add('https://news.sina.com.cn/')

# init
main_app = Sanic(__name__)


@main_app.listener('after_server_stop')
async def cache_url_pool(app=None, loop=None):
    global url_pool
    print('caching url_pool after_server_stop')
    del url_pool
    print('bye!')


@main_app.route('/task')
async def task_get(request=None):
    count = request.args.get('count', 10)
    try:
        count = int(count)
    except BaseException as be:
        count = 10
    urls = url_pool.pop(count)
    return response.json(urls)


@main_app.route('/task', methods=['POST', ])
async def task_post(request=None):
    result = request.json
    url_pool.set_status(result['url'], result['status'])
    if result['url_real'] != result['url']:
        url_pool.set_status(result['url_real'], result['status'])
    if result['new_urls']:
        print('receive URLs:', len(result['new_urls']))
        for url in result['new_urls']:
            url_pool.add(url)
    return response.text('ok')


if __name__ == '__main__':
    main_app.run(host='0.0.0.0', port=8080, debug=False, access_log=False, workers=1)
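The server exposes a tiny HTTP protocol: GET /task?count=N hands out a JSON dict of {url: is_hub_flag} and POST /task reports a result back. A quick manual check with requests (a sketch; host/port as configured above, the reported URLs are hypothetical):

import requests

# ask the server for up to 3 URLs to crawl: {url: is_hub_flag, ...}
tasks = requests.get('http://localhost:8080/task', params={'count': 3}).json()
print(tasks)

# report one (hypothetical) result back in the format task_post() expects
result = {
    'url': 'https://news.sina.com.cn/',
    'url_real': 'https://news.sina.com.cn/',   # final URL after redirects
    'status': 200,
    'new_urls': ['https://news.sina.com.cn/c/example.shtml'],  # hypothetical extracted link
}
print(requests.post('http://localhost:8080/task', json=result).text)  # prints 'ok'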

client.py

# -*- coding: utf-8 -*-
# @Author : 佛祖保佑, 永无 bug
# @Date :
# @File : client.py
# @Software: PyCharm
# @description : XXX

import re
import cchardet
import traceback
import time
import json
import asyncio
import urllib.parse as urlparse
import aiohttp

# import uvloop
# asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

p_tag_a = re.compile(r'<a[^>]*?href=[\'"]?([^> \'"]+)[^>]*?>(.*?)</a>', re.I | re.S | re.M)


def extract_links_re(url, html):
    new_links = set()
    aa = p_tag_a.findall(html)
    for a in aa:
        link = a[0].strip()
        if not link:
            continue
        link = urlparse.urljoin(url, link)
        if not link.startswith('http'):
            continue
        new_links.add(link)
    return new_links


class CrawlerClient:
    def __init__(self, ):
        self._workers = 0
        self.workers_max = 20
        self.server_host = 'localhost'
        self.server_port = 8080
        self.headers = {'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; '
                                       'Windows NT 6.1; Win64; x64; Trident/5.0)')}
        self.loop = asyncio.get_event_loop()
        self.queue = asyncio.Queue(loop=self.loop)
        self.session = aiohttp.ClientSession(loop=self.loop)

    async def download(self, url, timeout=25):
        status_code = 900
        html = ''
        url_now = url
        try:
            async with self.session.get(url_now, headers=self.headers, timeout=timeout) as response:
                status_code = response.status
                html = await response.read()
                encoding = cchardet.detect(html)['encoding']
                html = html.decode(encoding, errors='ignore')
                url_now = str(response.url)
        except BaseException as be:
            # traceback.print_exc()
            print('=== exception: ', be, type(be), str(be))
            msg = 'Failed download: {} | exception: {}, {}'.format(url, str(type(be)), str(be))
            print(msg)
        return status_code, html, url_now

    async def get_urls(self, ):
        count = self.workers_max - self.queue.qsize()
        if count <= 0:
            print('no need to get urls this time')
            return None
        url = f'http://{self.server_host}:{self.server_port}/task?count={count}'
        try:
            async with self.session.get(url, timeout=3) as response:
                if response.status not in [200, 201]:
                    return
                jsn = await response.text()
                urls = json.loads(jsn)
                msg = f'get_urls() to get [{count}] but got[{len(urls)}], @{time.strftime("%Y-%m-%d %H:%M:%S")}'
                print(msg)
                for kv in urls.items():
                    await self.queue.put(kv)
                print('queue size:', self.queue.qsize(), ', _workers:', self._workers)
        except BaseException as be:
            traceback.print_exc()
        return

    async def send_result(self, result):
        url = f'http://{self.server_host}:{self.server_port}/task'
        try:
            async with self.session.post(url, json=result, timeout=3) as response:
                return response.status
        except BaseException as be:
            traceback.print_exc()

    @staticmethod
    def save_html(url, html):
        print('saved:', url, len(html))

    @staticmethod
    def filter_good(urls):
        """Keep only the URLs you actually want, according to your crawling goal."""
        good = []
        for url in urls:
            if url.startswith('http'):
                good.append(url)
        return good

    async def process(self, url, is_hub):
        status, html, url_now = await self.download(url)
        self._workers -= 1
        print('downloaded:', url, ', html:', len(html))
        if html:
            new_urls = extract_links_re(url, html)
            new_urls = self.filter_good(new_urls)
            self.save_html(url, html)
        else:
            new_urls = []
        result = {
            'url': url,
            'url_real': url_now,
            'status': status,
            'new_urls': new_urls,
        }
        await self.send_result(result)

    async def loop_get_urls(self, ):
        print('loop_get_urls() start')
        while 1:
            await self.get_urls()
            await asyncio.sleep(1)

    async def loop_crawl(self, ):
        print('loop_crawl() start')
        asyncio.ensure_future(self.loop_get_urls())
        counter = 0
        while 1:
            item = await self.queue.get()
            url, url_level = item
            self._workers += 1
            counter += 1
            asyncio.ensure_future(self.process(url, url_level))
            if self._workers > self.workers_max:
                print('====== got workers_max, sleep 3 sec to next worker =====')
                await asyncio.sleep(3)

    def start(self):
        try:
            self.loop.run_until_complete(self.loop_crawl())
        except KeyboardInterrupt:
            print('stopped by yourself!')


def run():
    ant = CrawlerClient()
    ant.start()


if __name__ == '__main__':
    run()

Google Translate

Target: the Google Translate web interface (translate.google.cn), file translate_google.py

# -*- coding: utf-8 -*-
# @Author : 佛祖保佑, 永无 bug
# @Date :
# @File : translate_google.py
# @Software: PyCharm
# @description : XXX

import requests
import urllib3

urllib3.disable_warnings()


def test(kw=None):
    url = 'https://translate.google.cn/_/TranslateWebserverUi/data/batchexecute?rpcids=MkEWBc&hl=zh-CN'
    custom_headers = {
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'
    }
    # kw = 'this is a test'
    payload = f"f.req=[[[\"MkEWBc\",\"[[\\\"{kw}\\\",\\\"auto\\\",\\\"zh-CN\\\",true],[null]]\",null,\"generic\"]]]&"
    resp = requests.post(url, data=payload, headers=custom_headers)
    print(resp.status_code)
    print(resp.text)


if __name__ == '__main__':
    kw_list = [
        'I love u', 'hello, baby', 'king',
        'this is a test'
    ]
    for item in kw_list:
        test(kw=item)
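The hand-escaped payload above only works cleanly for plain ASCII input. A sketch that builds the same doubly JSON-encoded f.req body with json.dumps and urlencode (same endpoint and rpcid as the script above; the response is left unparsed here, and the exact response layout is not covered by the original post):

import json
import urllib.parse
import requests

def translate(kw, target='zh-CN'):
    url = ('https://translate.google.cn/_/TranslateWebserverUi/data/batchexecute'
           '?rpcids=MkEWBc&hl=zh-CN')
    # inner request: [[text, source_lang, target_lang, true], [null]]
    inner = json.dumps([[kw, 'auto', target, True], [None]])
    # outer envelope: [[["MkEWBc", <inner as a JSON string>, null, "generic"]]]
    freq = json.dumps([[['MkEWBc', inner, None, 'generic']]])
    payload = urllib.parse.urlencode({'f.req': freq})
    headers = {'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'}
    resp = requests.post(url, data=payload, headers=headers)
    return resp.text

print(translate('this is a test'))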

Baidu Translate

Baidu Translate: https://fanyi.baidu.com/


Capture the basetrans request in DevTools.

(Send it a few times and you will notice that sign is different every time, so sign has to be reverse-engineered.)


Use Postman to trim the request down to the minimum, checking which parameters are required and which can simply be dropped. The trimmed request:

  • Request URL: https://fanyi.baidu.com/basetrans
  • Request headers: Content-Type: application/x-www-form-urlencoded
    Cookie: BAIDUID=AF87393A8DB7C8FED7859A909FF081A3:SL=0:NR=50:FG=1;
    User-Agent: Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Mobile Safari/537.36
  • Request body: query=king&from=en&to=zh&token=94c55bca8b920035077b58d58ba32bea&sign=612765.899756

Python code generated by Postman:

import requests

url = "https://fanyi.baidu.com/basetrans"
payload = "query=king&from=en&to=zh&token=94c55bca8b920035077b58d58ba32bea&sign=612765.899756"
headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'Cookie': 'BAIDUID=AF87393A8DB7C8FED7859A909FF081A3:SL=0:NR=50:FG=1;',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Mobile Safari/537.36'
}
response = requests.request("POST", url, headers=headers, data=payload)
print(response.text)

JS breakpoint debugging


Set a breakpoint, then use the call stack to trace how sign is generated.

Because every Ajax request goes through b.send(e.data ? e.data : null), you have to hit "resume to the next breakpoint" several times before you reach the request that carries sign.

If you want to stop only on the request you care about, add a "URL contains" XHR breakpoint instead.


Here we trace the sign value by stepping from breakpoint to breakpoint.


The function that generates sign:


Click into the P(e) function to see its implementation:


Method 1: re-implement the function's logic directly in Python (a sketch of this is given at the end of this section).

Method 2: lift the JS code out and run it from Python.

The function is fairly long, so here we simply extract the JS:

function n(r, o) {
    for (var t = 0; t < o.length - 2; t += 3) {
        var e = o.charAt(t + 2);
        e = e >= "a" ? e.charCodeAt(0) - 87 : Number(e),
        e = "+" === o.charAt(t + 1) ? r >>> e : r << e,
        r = "+" === o.charAt(t) ? r + e & 4294967295 : r ^ e
    }
    return r
}

function sign(r) {
    var t = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
    if (null === t) {
        var a = r.length;
        a > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(a / 2) - 5, 10) + r.substr(-10, 10))
    } else {
        for (var C = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), h = 0, f = C.length, u = []; f > h; h++)
            "" !== C[h] && u.push.apply(u, e(C[h].split(""))),
            h !== f - 1 && u.push(t[h]);
        var g = u.length;
        g > 30 && (r = u.slice(0, 10).join("") + u.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + u.slice(-10).join(""))
    }
    var l = void 0
      , d = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
    // l = null !== i ? i : (i = o.common[d] || "") || "";
    l = "320305.131321201";
    for (var m = l.split("."), S = Number(m[0]) || 0, s = Number(m[1]) || 0, c = [], v = 0, F = 0; F < r.length; F++) {
        var p = r.charCodeAt(F);
        128 > p ? c[v++] = p : (2048 > p ? c[v++] = p >> 6 | 192 : (55296 === (64512 & p) && F + 1 < r.length && 56320 === (64512 & r.charCodeAt(F + 1)) ? (p = 65536 + ((1023 & p) << 10) + (1023 & r.charCodeAt(++F)),
        c[v++] = p >> 18 | 240,
        c[v++] = p >> 12 & 63 | 128) : c[v++] = p >> 12 | 224,
        c[v++] = p >> 6 & 63 | 128),
        c[v++] = 63 & p | 128)
    }
    for (var w = S, A = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), b = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), D = 0; D < c.length; D++)
        w += c[D], w = n(w, A);
    return w = n(w, b),
    w ^= s,
    0 > w && (w = (2147483647 & w) + 2147483648),
    w %= 1e6,
    w.toString() + "." + (w ^ S)
}

console.log(sign('king'))

Execution result: [screenshot]

You can also run the JS directly in the Chrome console.


Calling the JS from Python (the r prefix in js_code = r"""...""" must not be omitted).

// l = null !== i ? i : (i = o.common[d] || "") || "";  — l is the page's gtk value; debugging shows it is a fixed value
l = "320305.131321201";  // so we simply hard-code l = "320305.131321201";

# @Author : 佛祖保佑, 永无 bug
# @Date :
# @File : translate_baidu.py
# @Software: PyCharm
# @description : XXX

import execjs
import requests

js_code = r"""
function n(r, o) {
    for (var t = 0; t < o.length - 2; t += 3) {
        var e = o.charAt(t + 2);
        e = e >= "a" ? e.charCodeAt(0) - 87 : Number(e),
        e = "+" === o.charAt(t + 1) ? r >>> e : r << e,
        r = "+" === o.charAt(t) ? r + e & 4294967295 : r ^ e
    }
    return r
}

function sign(r) {
    var t = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
    if (null === t) {
        var a = r.length;
        a > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(a / 2) - 5, 10) + r.substr(-10, 10))
    } else {
        for (var C = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), h = 0, f = C.length, u = []; f > h; h++)
            "" !== C[h] && u.push.apply(u, e(C[h].split(""))),
            h !== f - 1 && u.push(t[h]);
        var g = u.length;
        g > 30 && (r = u.slice(0, 10).join("") + u.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + u.slice(-10).join(""))
    }
    var l = void 0
      , d = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
    // l = null !== i ? i : (i = o.common[d] || "") || "";
    l = "320305.131321201";
    for (var m = l.split("."), S = Number(m[0]) || 0, s = Number(m[1]) || 0, c = [], v = 0, F = 0; F < r.length; F++) {
        var p = r.charCodeAt(F);
        128 > p ? c[v++] = p : (2048 > p ? c[v++] = p >> 6 | 192 : (55296 === (64512 & p) && F + 1 < r.length && 56320 === (64512 & r.charCodeAt(F + 1)) ? (p = 65536 + ((1023 & p) << 10) + (1023 & r.charCodeAt(++F)),
        c[v++] = p >> 18 | 240,
        c[v++] = p >> 12 & 63 | 128) : c[v++] = p >> 12 | 224,
        c[v++] = p >> 6 & 63 | 128),
        c[v++] = 63 & p | 128)
    }
    for (var w = S, A = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), b = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), D = 0; D < c.length; D++)
        w += c[D], w = n(w, A);
    return w = n(w, b),
    w ^= s,
    0 > w && (w = (2147483647 & w) + 2147483648),
    w %= 1e6,
    w.toString() + "." + (w ^ S)
}

console.log(sign('king'))
"""

js_func = execjs.compile(js_code)


def test(kw=None):
    url = "https://fanyi.baidu.com/basetrans"
    sign = js_func.call('sign', kw)
    payload = f"query={kw}&from=en&to=zh&token=94c55bca8b920035077b58d58ba32bea&sign={sign}"
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Cookie': 'BAIDUID=AF87393A8DB7C8FED7859A909FF081A3:SL=0:NR=50:FG=1;',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Mobile Safari/537.36'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    # print(response.text)
    print(response.json()['trans'][0]['dst'])


if __name__ == '__main__':
    kw_list = [
        'hello baby', 'I love u',
        'king'
    ]
    for item in kw_list:
        test(kw=item)

Execution result: [screenshot]
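For Method 1 mentioned above (re-implementing the logic in Python instead of running the JS), here is a sketch of a pure-Python port of the sign routine. It assumes the same fixed gtk value 320305.131321201 and plain BMP input (no emoji/surrogate pairs); the function names are my own, and the output is worth cross-checking against js_func.call('sign', ...) from the script above:

def _trans(r, o):
    # Python port of the JS helper n(r, o): a chain of 32-bit adds/shifts/xors
    for t in range(0, len(o) - 2, 3):
        e = o[t + 2]
        e = ord(e) - 87 if e >= 'a' else int(e)
        e = (r & 0xFFFFFFFF) >> e if o[t + 1] == '+' else (r << e) & 0xFFFFFFFF
        r = (r + e) & 0xFFFFFFFF if o[t] == '+' else r ^ e
    return r

def baidu_sign(query, gtk='320305.131321201'):
    # only covers the branch the JS takes for plain BMP text
    if len(query) > 30:
        query = query[:10] + query[len(query) // 2 - 5:len(query) // 2 + 5] + query[-10:]
    part1, part2 = (int(x) for x in gtk.split('.'))
    w = part1
    for byte in query.encode('utf-8'):   # the JS char loop is a hand-rolled UTF-8 encoder
        w = _trans(w + byte, '+-a^+6')
    w = _trans(w, '+-3^+b+-f')
    w ^= part2
    if w < 0:
        w = (w & 0x7FFFFFFF) + 0x80000000
    w %= 1000000
    return f'{w}.{w ^ part1}'

# e.g. baidu_sign('king') should reproduce the sign computed by the JS above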

Youdao Translate

Reference: 爬虫破解js加密(二) 有道词典js加密参数 sign破解 - 淋哥 - 博客园

See also (a very detailed walkthrough of reversing the Baidu Translate sign parameter): 【2019.05】JS逆向——破解百度翻译参数(sign)爬虫 超级详细 - ybsde博客 - CSDN博客

Running JS code from Python

Ways to execute JS from Python:

    1. Use the execjs module to evaluate a JS file (install with: pip install PyExecJS).
    2. Call node.js from Python (node.js must be installed first); see the sketch after the execjs example below.
    3. Use PyV8 to run JS inside the Python crawler (search: python pyv8).
    4. Drive a browser (Selenium, PhantomJS) to execute the JS: 网络爬虫之记一次js逆向解密经历 - 奥辰 - 博客园

Differences between PyV8, PyExecJS and js2py: PyV8、PyExecJS、js2py区别 - 简书

PyV8 on GitHub: https://github.com/emmetio/pyv8-binaries

Example of using execjs from Python:

Note that execjs only handles fairly plain, self-contained JS code; see: python 使用execjs执行js解密时报错 execjs UnicodeDecodeError: 'gbk' codec can't decode byte - zhaojiafu的博客 - CSDN博客

import execjs

js_str = '''
function add(x, y){
    return x + y;
}
'''
test = execjs.compile(js_str)
# call() invokes a JS function: 'add' is the function name in js_str, 1 and 2 are its arguments
result = test.call('add', 1, 2)
print(result)
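Option 2 in the list above (calling node.js directly) needs no extra Python package. A minimal sketch, assuming node is installed and on PATH, and that we simply append a console.log() call to the JS we want to evaluate:

import json
import subprocess

def run_js_with_node(js_source, func_name, *args):
    # build a call like: console.log(JSON.stringify(add(1, 2)));
    call_line = f'\nconsole.log(JSON.stringify({func_name}({", ".join(json.dumps(a) for a in args)})));'
    completed = subprocess.run(
        ['node', '-e', js_source + call_line],
        capture_output=True, text=True, check=True
    )
    return json.loads(completed.stdout.strip())

js_str = '''
function add(x, y){
    return x + y;
}
'''
print(run_js_with_node(js_str, 'add', 1, 2))  # -> 3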

Cracking the Youdao dictionary's encrypted sign parameter

Analysis:


Then switch to the Headers tab and inspect the request body; you can see verification fields such as salt and sign:


Press Ctrl + Shift + F to search globally for the keyword sign across all JS files. Only one file matches, so double-click it:


Search for sign inside that JS file and work out how it is produced:


Extract the JS hash function used for sign (here it is MD5).

Stepping through with the debugger, you can locate the md5 function:


Pull the md5 function out into a separate JS file, then, again using the debugger, extract every helper function that md5 depends on.

Directory layout: yd_js.js and yd.py sit in the same directory (yd.py reads yd_js.js by a relative path). [screenshot]

All of the extracted JS functions (yd_js.js):

function n(e, t) {
    return e << t | e >>> 32 - t
}

function r(e, t) {
    var n, r, i, o, a;
    return i = 2147483648 & e,
    o = 2147483648 & t,
    n = 1073741824 & e,
    r = 1073741824 & t,
    a = (1073741823 & e) + (1073741823 & t),
    n & r ? 2147483648 ^ a ^ i ^ o : n | r ? 1073741824 & a ? 3221225472 ^ a ^ i ^ o : 1073741824 ^ a ^ i ^ o : a ^ i ^ o
}

function i(e, t, n) {
    return e & t | ~e & n
}

function o(e, t, n) {
    return e & n | t & ~n
}

function a(e, t, n) {
    return e ^ t ^ n
}

function s(e, t, n) {
    return t ^ (e | ~n)
}

function l(e, t, o, a, s, l, c) {
    return e = r(e, r(r(i(t, o, a), s), c)),
    r(n(e, l), t)
}

function c(e, t, i, a, s, l, c) {
    return e = r(e, r(r(o(t, i, a), s), c)),
    r(n(e, l), t)
}

function u(e, t, i, o, s, l, c) {
    return e = r(e, r(r(a(t, i, o), s), c)),
    r(n(e, l), t)
}

function f(e, t, i, o, a, l, c) {
    return e = r(e, r(r(s(t, i, o), a), c)),
    r(n(e, l), t)
}

function d(e) {
    for (var t, n = e.length, r = n + 8, i = 16 * ((r - r % 64) / 64 + 1), o = Array(i - 1), a = 0, s = 0; s < n;)
        a = s % 4 * 8,
        o[t = (s - s % 4) / 4] = o[t] | e.charCodeAt(s) << a,
        s++;
    return t = (s - s % 4) / 4,
    a = s % 4 * 8,
    o[t] = o[t] | 128 << a,
    o[i - 2] = n << 3,
    o[i - 1] = n >>> 29,
    o
}

function p(e) {
    var t, n = "", r = "";
    for (t = 0; t <= 3; t++)
        n += (r = "0" + (e >>> 8 * t & 255).toString(16)).substr(r.length - 2, 2);
    return n
}

function h(e) {
    e = e.replace(/\x0d\x0a/g, "\n");
    for (var t = "", n = 0; n < e.length; n++) {
        var r = e.charCodeAt(n);
        if (r < 128)
            t += String.fromCharCode(r);
        else if (r > 127 && r < 2048)
            t += String.fromCharCode(r >> 6 | 192),
            t += String.fromCharCode(63 & r | 128);
        else if (r >= 55296 && r <= 56319) {
            if (n + 1 < e.length) {
                var i = e.charCodeAt(n + 1);
                if (i >= 56320 && i <= 57343) {
                    var o = 1024 * (r - 55296) + (i - 56320) + 65536;
                    t += String.fromCharCode(240 | o >> 18 & 7),
                    t += String.fromCharCode(128 | o >> 12 & 63),
                    t += String.fromCharCode(128 | o >> 6 & 63),
                    t += String.fromCharCode(128 | 63 & o),
                    n++
                }
            }
        } else
            t += String.fromCharCode(r >> 12 | 224),
            t += String.fromCharCode(r >> 6 & 63 | 128),
            t += String.fromCharCode(63 & r | 128)
    }
    return t;
}

function md5(e) {
    var t, n, i, o, a, s, m, g, v, y = Array();
    for (
        e = h(e),
        y = d(e),
        s = 1732584193,
        m = 4023233417,
        g = 2562383102,
        v = 271733878,
        t = 0; t < y.length; t += 16
    )
        n = s,
        i = m,
        o = g,
        a = v,
        s = l(s, m, g, v, y[t + 0], 7, 3614090360),
        v = l(v, s, m, g, y[t + 1], 12, 3905402710),
        g = l(g, v, s, m, y[t + 2], 17, 606105819),
        m = l(m, g, v, s, y[t + 3], 22, 3250441966),
        s = l(s, m, g, v, y[t + 4], 7, 4118548399),
        v = l(v, s, m, g, y[t + 5], 12, 1200080426),
        g = l(g, v, s, m, y[t + 6], 17, 2821735955),
        m = l(m, g, v, s, y[t + 7], 22, 4249261313),
        s = l(s, m, g, v, y[t + 8], 7, 1770035416),
        v = l(v, s, m, g, y[t + 9], 12, 2336552879),
        g = l(g, v, s, m, y[t + 10], 17, 4294925233),
        m = l(m, g, v, s, y[t + 11], 22, 2304563134),
        s = l(s, m, g, v, y[t + 12], 7, 1804603682),
        v = l(v, s, m, g, y[t + 13], 12, 4254626195),
        g = l(g, v, s, m, y[t + 14], 17, 2792965006),
        m = l(m, g, v, s, y[t + 15], 22, 1236535329),
        s = c(s, m, g, v, y[t + 1], 5, 4129170786),
        v = c(v, s, m, g, y[t + 6], 9, 3225465664),
        g = c(g, v, s, m, y[t + 11], 14, 643717713),
        m = c(m, g, v, s, y[t + 0], 20, 3921069994),
        s = c(s, m, g, v, y[t + 5], 5, 3593408605),
        v = c(v, s, m, g, y[t + 10], 9, 38016083),
        g = c(g, v, s, m, y[t + 15], 14, 3634488961),
        m = c(m, g, v, s, y[t + 4], 20, 3889429448),
        s = c(s, m, g, v, y[t + 9], 5, 568446438),
        v = c(v, s, m, g, y[t + 14], 9, 3275163606),
        g = c(g, v, s, m, y[t + 3], 14, 4107603335),
        m = c(m, g, v, s, y[t + 8], 20, 1163531501),
        s = c(s, m, g, v, y[t + 13], 5, 2850285829),
        v = c(v, s, m, g, y[t + 2], 9, 4243563512),
        g = c(g, v, s, m, y[t + 7], 14, 1735328473),
        m = c(m, g, v, s, y[t + 12], 20, 2368359562),
        s = u(s, m, g, v, y[t + 5], 4, 4294588738),
        v = u(v, s, m, g, y[t + 8], 11, 2272392833),
        g = u(g, v, s, m, y[t + 11], 16, 1839030562),
        m = u(m, g, v, s, y[t + 14], 23, 4259657740),
        s = u(s, m, g, v, y[t + 1], 4, 2763975236),
        v = u(v, s, m, g, y[t + 4], 11, 1272893353),
        g = u(g, v, s, m, y[t + 7], 16, 4139469664),
        m = u(m, g, v, s, y[t + 10], 23, 3200236656),
        s = u(s, m, g, v, y[t + 13], 4, 681279174),
        v = u(v, s, m, g, y[t + 0], 11, 3936430074),
        g = u(g, v, s, m, y[t + 3], 16, 3572445317),
        m = u(m, g, v, s, y[t + 6], 23, 76029189),
        s = u(s, m, g, v, y[t + 9], 4, 3654602809),
        v = u(v, s, m, g, y[t + 12], 11, 3873151461),
        g = u(g, v, s, m, y[t + 15], 16, 530742520),
        m = u(m, g, v, s, y[t + 2], 23, 3299628645),
        s = f(s, m, g, v, y[t + 0], 6, 4096336452),
        v = f(v, s, m, g, y[t + 7], 10, 1126891415),
        g = f(g, v, s, m, y[t + 14], 15, 2878612391),
        m = f(m, g, v, s, y[t + 5], 21, 4237533241),
        s = f(s, m, g, v, y[t + 12], 6, 1700485571),
        v = f(v, s, m, g, y[t + 3], 10, 2399980690),
        g = f(g, v, s, m, y[t + 10], 15, 4293915773),
        m = f(m, g, v, s, y[t + 1], 21, 2240044497),
        s = f(s, m, g, v, y[t + 8], 6, 1873313359),
        v = f(v, s, m, g, y[t + 15], 10, 4264355552),
        g = f(g, v, s, m, y[t + 6], 15, 2734768916),
        m = f(m, g, v, s, y[t + 13], 21, 1309151649),
        s = f(s, m, g, v, y[t + 4], 6, 4149444226),
        v = f(v, s, m, g, y[t + 11], 10, 3174756917),
        g = f(g, v, s, m, y[t + 2], 15, 718787259),
        m = f(m, g, v, s, y[t + 9], 21, 3951481745),
        s = r(s, n),
        m = r(m, i),
        g = r(g, o),
        v = r(v, a);
    return (p(s) + p(m) + p(g) + p(v)).toLowerCase()
}

// t = (new Date).getTime() + parseInt(10 * Math.random(), 10);
// console.log((new Date).getTime());
// console.log(t);

Python code (yd.py):

import execjs
import time
import random
import requests
import json

"""
Searching the js file for salt or sign turns up:
1. the formula that computes salt:
   r = "" + ((new Date).getTime() + parseInt(10 * Math.random(), 10))
2. sign: n.md5("fanyideskweb" + t + r + "p09@Bn{h02_BIEe]$P^nG");
   md5 takes four concatenated parts: the first and fourth are fixed strings,
   the third is the salt, and the second is the word to be translated.
"""


def get_md5(v):
    # read the js file
    with open('yd_js.js', encoding='utf-8') as f:
        js = f.read()
    # compile it into a js context object
    js_obj = execjs.compile(js)
    res = js_obj.call('md5', v)
    return res


def get_sign(key, salt):
    sign = "fanyideskweb" + str(key) + str(salt) + "n%A-rKaT5fb[Gy?;N5@Tj"
    sign = get_md5(sign)
    return sign


def you_dao(key):
    url = "http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule"
    ts = str(int((time.time() * 1000)))
    salt = str(ts) + str(random.randint(0, 10))
    data = {
        "i": key,
        "from": "AUTO",
        "to": "AUTO",
        "smartresult": "dict",
        "client": "fanyideskweb",
        "salt": str(salt),
        "sign": get_sign(key, salt),
        "ts": ts,
        "bv": "5872543b025b19167cde3785ecf1e925",
        "doctype": "json",
        "version": "2.1",
        "keyfrom": "fanyi.web",
        "action": "FY_BY_REALTIME",
        "typoResult": "false",
    }
    headers = {
        "Host": "fanyi.youdao.com",
        # "Proxy-Connection": "keep-alive",
        "Content-Length": str(len(data)),
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Origin": "http://fanyi.youdao.com",
        "X-Requested-With": "XMLHttpRequest",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/75.0.3770.90 Safari/537.36",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Referer": "http://fanyi.youdao.com/",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cookie": "OUTFOX_SEARCH_USER_ID=-803834638@61.149.7.46; "
                  "JSESSIONID=aaa2KqSwX9shJdA5Mk9Ww; OUTFOX_SEARCH_USER_ID_NCOO=1481235354.231604; "
                  "___rl__test__cookies=1564486753731",
    }
    r = requests.post(url=url, data=data, headers=headers)
    if r.status_code == 200:
        data = json.loads(r.text)
        src = data.get('translateResult')[0][0]['src']
        tgt = data.get('translateResult')[0][0]['tgt']
        print(f'Source     : {src}')
        print(f'Translated : {tgt}')
    else:
        print(f'Request failed, status code {r.status_code}')


if __name__ == '__main__':
    you_dao("hello, baby")
    you_dao("I love you")
    you_dao("thank you very much")
    you_dao("天王盖地虎,宝塔镇河妖")

Run result: [screenshot]
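Since the extracted yd_js.js is just a plain MD5 implementation, the same sign can also be computed with Python's standard hashlib and no execjs at all. A drop-in sketch for get_sign() above:

import hashlib

def get_sign_hashlib(key, salt):
    # same concatenation as get_sign() above, hashed with the standard library
    raw = "fanyideskweb" + str(key) + str(salt) + "n%A-rKaT5fb[Gy?;N5@Tj"
    return hashlib.md5(raw.encode('utf-8')).hexdigest()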

Baidu Index JS decryption

Analysis walkthrough: 爬虫之js加密参数破解练习-百度指数爬虫(附完整源码) - wang785994599的博客 - CSDN博客

See also: Python爬虫 - 简单抓取百度指数 - 知乎

Python implementation (copy the logged-in Cookie from your browser straight into the code):

# -*- coding: utf-8 -*-
import requests
import execjs
import urllib3

# suppress the InsecureRequestWarning triggered by verify=False
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

js_string = '''
function decrypt(t, e) {
    for (var n = t.split(""), i = e.split(""), a = {}, r = [], o = 0; o < n.length / 2; o++)
        a[n[o]] = n[n.length / 2 + o];
    for (var s = 0; s < e.length; s++)
        r.push(a[i[s]]);
    return r.join("")
}
'''

headers = {
    "Cookie": "<paste the Cookie of your logged-in browser session here>",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/75.0.3770.142 Safari/537.36"
}

data_url = 'https://index.baidu.com/api/SearchApi/index?word={}&area=0&days=7'
uniq_id_url = 'https://index.baidu.com/Interface/ptbk?uniqid={}'
keys = ["all", "pc", "wise"]


class BDIndex(object):
    def __init__(self):
        self.session = self.get_session()

    @staticmethod
    def get_session():
        """
        Initialise the requests session.
        :return:
        """
        session = requests.session()
        session.headers = headers
        session.verify = False
        return session

    @staticmethod
    def decrypt(key, data):
        """
        Decrypt the data.
        :param key: key
        :param data: the encrypted value that goes with the key
        :return:
        """
        js_handler = execjs.compile(js_string)
        return js_handler.call('decrypt', key, data)

    def get_bd_index(self, key_word):
        """
        Fetch the Baidu Index for a keyword.
        :param key_word:
        :return:
        """
        response = self.session.get(data_url.format(key_word)).json()
        uniq_id = self.session.get(
            uniq_id_url.format(response.get("data").get("uniqid"))
        ).json().get("data")
        result = []
        data_dict = response.get("data").get("userIndexes")[0]
        for key in keys:
            decrypt_data = self.decrypt(uniq_id, data_dict.get(key).get("data"))
            result.append({key: decrypt_data})
        return result


if __name__ == '__main__':
    bd = BDIndex()
    d = bd.get_bd_index("杨幂")
    print(d)

Run result: [screenshot]
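The decrypt routine here is a simple substitution table: the first half of the key maps onto the second half. It can therefore be written directly in Python without execjs; a sketch equivalent to the JS above (assuming, as that JS does, an even-length key):

def decrypt_py(key, data):
    # first half of `key` holds the cipher characters, second half the plain ones
    half = len(key) // 2
    mapping = dict(zip(key[:half], key[half:]))
    return ''.join(mapping[ch] for ch in data)

# inside BDIndex.decrypt() this would be: return decrypt_py(key, data)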
