【youtube】爬虫数据采集案例

朴灿烈づ我的快乐病毒、 2024-04-17 11:27 163阅读 0赞

" class="reference-link">从此烟雨落金城,一人撑伞两人行在这里插入图片描述

案例展示

# -*- coding:utf-8 -*-
# @software: PyCharm
# desc: YouTube crawler — collects channel info, video lists, per-video
#       detail and comment counts into local MongoDB (db "crawlers").
import datetime
import json
from loguru import logger as logging  # third-party logger, aliased to read like stdlib logging
import re
from json import load, dumps
from os import path
from re import findall
import pymongo
import requests
import scrapy  # used only for its Selector (xpath over the fetched HTML)
from pymongo.errors import DuplicateKeyError

# Absolute directory of this script; used to locate tube_dl_config.json.
cwd = path.dirname(path.abspath(__file__))
  16. class VideoError(Exception):
  17. def __init__(self, vid):
  18. self.message = f'Invalid video ID. Are you sure "{
  19. vid}" is a valid URL?'
  20. super().__init__(self.message)
  21. class PlaylistError(Exception):
  22. def __init__(self, pid):
  23. self.message = f'Invalid Playlist ID. Are you sure "{
  24. pid}" is a valid URL and available?'
  25. super().__init__(self.message)
  26. def fetch_and_save_video_info():
  27. gMongoClient = pymongo.MongoClient()
  28. gMongoDb = gMongoClient['crawlers']
  29. gMongoCollection = gMongoDb['youtube']
  30. docs = gMongoCollection.find({
  31. })
  32. for doc in docs:
  33. videoId = doc['videoId']
  34. url = f"https://www.youtube.com/watch?v={
  35. videoId}"
  36. title = doc['title']
  37. shortViewCountText = doc['shortViewCountText']
  38. channelTitle = doc['channelTitle']
  39. channelId = doc['channelId']
  40. canonicalBaseUrl = doc['canonicalBaseUrl']
  41. subscriberCount = doc['subscriberCount']
  42. videosCount = doc['videosCount']
  43. headers = {
  44. 'user-agent': (
  45. 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'),
  46. 'referer': 'https://youtube.com'}
  47. vid = "".join([i for i in findall(r"v=(.*?)&|youtu.be\/(.*?)&", url + "&")[0]])
  48. logging.info(vid)
  49. json_file = load(open(cwd + "/tube_dl_config.json", "rb"))
  50. headers["x-youtube-client-version"] = json_file['cver']
  51. headers["x-youtube-client-name"] = json_file['cname']
  52. y_data = requests.get(url=f"https://youtube.com/watch?v={
  53. vid}&pbj=1", headers=headers,
  54. ).json()
  55. yt_data = [i for i in y_data if "playerResponse" in i.keys()][0]["playerResponse"]
  56. if yt_data["playabilityStatus"]["status"] == "ERROR":
  57. raise VideoError(vid)
  58. shortDescription = yt_data['videoDetails']['shortDescription']
  59. logging.info(dumps(y_data))
  60. # logging.info(shortDescription)
  61. # "label": "102,293 likes"
  62. likes = re.findall('"defaultText": \{"accessibility": \{"accessibilityData": \{"label": "(.*?) likes"\}\}',
  63. dumps(y_data))[0]
  64. try:
  65. if likes:
  66. # 格式化成数字
  67. likes = int(likes.replace(',', ''))
  68. except Exception as e:
  69. logging.info(e)
  70. likes = 0
  71. publishDate = yt_data['microformat']['playerMicroformatRenderer']['publishDate'] # 2023-11-04T16:00:11-07:00
  72. publishDateDay = publishDate.split('T')[0] # 2023-11-04
  73. viewCount = re.findall('"allowRatings": true, "viewCount": "(.*?)",', dumps(y_data))[0]
  74. if '万 个视频' in videosCount:
  75. videosCount = videosCount.replace('万 个视频', '')
  76. videosCount = float(videosCount) * 10000
  77. if '万位订阅者' in subscriberCount:
  78. subscriberCount = subscriberCount.replace('万位订阅者', '')
  79. subscriberCount = float(subscriberCount) * 10000
  80. item = {
  81. 'videoId': videoId,
  82. 'title': title,
  83. 'viewCount': viewCount,
  84. 'channelTitle': channelTitle,
  85. 'channelId': channelId,
  86. 'canonicalBaseUrl': canonicalBaseUrl,
  87. 'subscriberCount': subscriberCount,
  88. 'videosCount': videosCount,
  89. 'shortDescription': shortDescription,
  90. 'likes': likes,
  91. 'createTime': datetime.datetime.now(),
  92. 'day': datetime.datetime.now().strftime('%Y-%m-%d'),
  93. "publishDate": publishDate,
  94. "publishDateDay": publishDateDay,
  95. }
  96. gMongoDb.get_collection('youtube_detail_info').insert_one(item)
  97. def fetch_and_save_comments_info():
  98. gMongoClient = pymongo.MongoClient()
  99. gMongoDb = gMongoClient['crawlers']
  100. docs = gMongoDb.get_collection('youtube_detail_info').find({
  101. "commentsCount": {
  102. "$exists": False}})
  103. for doc in docs:
  104. url = f'https://www.youtube.com/watch?v={
  105. doc["videoId"]}'
  106. headers = {
  107. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  108. 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
  109. 'x-client-data': '自定义',
  110. 'sec-fetch-site': 'same-origin', 'sec-fetch-mode': 'navigate', 'sec-fetch-user': '?1',
  111. 'Cookie': f"{
  112. cookie}",
  113. 'sec-fetch-dest': 'document', 'accept-encoding': 'gzip, deflate, br',
  114. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
  115. }
  116. resp = requests.get(url=url, headers=headers)
  117. response = scrapy.Selector(text=resp.text)
  118. token = re.findall(r'"continuationCommand":{"token":"(.*?)"', resp.text)[0]
  119. apiKey = re.findall(r'"innertubeApiKey":"(.*?)"', resp.text)[0]
  120. url = f'https://www.youtube.com/youtubei/v1/next?key={
  121. apiKey}&prettylogging.info=false'
  122. payload = {
  123. "context": {
  124. "client": {
  125. "deviceMake": "Apple", "deviceModel": "", "visitorData": "", "clientName": "WEB",
  126. "clientVersion": "2.20240123.01.00", "configInfo": {
  127. "appInstallData": ""},
  128. "mainAppWebInfo": {
  129. "graftUrl": ""}}, "user": {
  130. "lockedSafetyMode": False},
  131. "request": {
  132. "useSsl": True, "internalExperimentFlags": [], "consistencyTokenJars": []},
  133. "clickTracking": {
  134. "clickTrackingParams": ""}, "adSignalsInfo": {
  135. "params": [], "bid": ""}},
  136. "continuation": f"{
  137. token}"}
  138. videoResp = requests.post(url=url, headers=headers, json=payload)
  139. videoRespJson = json.loads(videoResp.text)
  140. try:
  141. commentsCount = re.findall(r'"commentsCount":\{"runs":\[\{"text":"(.*?)"\}\]\}', videoResp.text)[0]
  142. except Exception as e:
  143. logging.info(e)
  144. commentsCount = 0
  145. gMongoDb.get_collection('youtube_detail_info').update_one({
  146. '_id': doc['_id']},
  147. {
  148. '$set': {
  149. 'commentsCount': commentsCount}})
  150. def get_video_list(token, channelTitle, channelId, canonicalBaseUrl, subscriberCount, videosCount, gMongoDb):
  151. videlListUrl = 'https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettylogging.info=false'
  152. payload = {
  153. "context": {
  154. "client": {
  155. "deviceMake": "Apple", "deviceModel": "", "visitorData": "", "clientName": "WEB",
  156. "clientVersion": "2.20240123.01.00", "configInfo": {
  157. "appInstallData": ""},
  158. "mainAppWebInfo": {
  159. "graftUrl": ""}}, "user": {
  160. "lockedSafetyMode": False},
  161. "request": {
  162. "useSsl": True, "internalExperimentFlags": [], "consistencyTokenJars": []},
  163. "clickTracking": {
  164. "clickTrackingParams": ""}, "adSignalsInfo": {
  165. "params": [], "bid": ""}},
  166. "continuation": f"{
  167. token}"}
  168. headers = {
  169. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  170. 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
  171. 'x-client-data': '自定义',
  172. 'sec-fetch-site': 'same-origin', 'sec-fetch-mode': 'navigate', 'sec-fetch-user': '?1',
  173. 'Cookie': f"{
  174. cookie}",
  175. 'sec-fetch-dest': 'document', 'accept-encoding': 'gzip, deflate, br',
  176. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
  177. }
  178. videoResp = requests.post(url=videlListUrl, headers=headers, json=payload, )
  179. videoRespJson = json.loads(videoResp.text)
  180. logging.info(videoResp.text)
  181. # .onResponseReceivedActions[0].appendContinuationItemsAction.continuationItems
  182. videoList = videoRespJson['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']
  183. for eVideo in videoList[:-1]:
  184. videoId = eVideo['richItemRenderer']['content']['videoRenderer']['videoId']
  185. title = eVideo['richItemRenderer']['content']['videoRenderer']['title']['runs'][0]['text']
  186. # richItemRenderer.content.videoRenderer.shortViewCountText.simpleText
  187. shortViewCountText = eVideo['richItemRenderer']['content']['videoRenderer']['shortViewCountText']['simpleText']
  188. item = {
  189. 'videoId': videoId,
  190. 'title': title,
  191. 'shortViewCountText': shortViewCountText,
  192. 'channelTitle': channelTitle,
  193. 'channelId': channelId,
  194. 'canonicalBaseUrl': canonicalBaseUrl,
  195. 'subscriberCount': subscriberCount,
  196. 'videosCount': videosCount,
  197. }
  198. logging.info(item)
  199. try:
  200. gMongoDb['youtube'].insert_one(item)
  201. except DuplicateKeyError as e:
  202. logging.info("重复数据")
  203. try:
  204. continuationCommand = videoList[-1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']
  205. token = continuationCommand['token']
  206. return get_video_list(token)
  207. except Exception as e:
  208. logging.info(e)
  209. logging.info('没有下一页了')
  210. def fetch_and_save_list_info():
  211. gMongoClient = pymongo.MongoClient()
  212. gMongoDb = gMongoClient['crawlers']
  213. url = 'https://www.youtube.com/@ganfutong/videos'
  214. headers = {
  215. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  216. 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
  217. 'x-client-data': '自定义',
  218. 'sec-fetch-site': 'same-origin', 'sec-fetch-mode': 'navigate', 'sec-fetch-user': '?1',
  219. 'Cookie': f"{
  220. cookie}",
  221. 'sec-fetch-dest': 'document', 'accept-encoding': 'gzip, deflate, br',
  222. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
  223. }
  224. resp = requests.get(url=url, headers=headers,
  225. )
  226. response = scrapy.Selector(text=resp.text)
  227. token = re.findall(r'"continuationCommand":{"token":"(.*?)"', resp.text)[0]
  228. context = response.xpath("//script[contains(string(),'var ytInitialData')]/text()").extract_first()
  229. contextJson = json.loads(context.replace('var ytInitialData = ', '')[:-1])
  230. subscriberCount = contextJson['header']['c4TabbedHeaderRenderer']['subscriberCountText']['simpleText']
  231. videosCountTexts = contextJson['header']['c4TabbedHeaderRenderer']['videosCountText']['runs']
  232. channelTitle = contextJson['header']['c4TabbedHeaderRenderer']['title']
  233. channelId = contextJson['header']['c4TabbedHeaderRenderer']['channelId']
  234. canonicalBaseUrl = contextJson['header']['c4TabbedHeaderRenderer']['navigationEndpoint']['browseEndpoint'][
  235. 'canonicalBaseUrl']
  236. videosCount = ''
  237. for i in videosCountTexts:
  238. videosCount += i['text']
  239. token = token
  240. logging.info(token)
  241. get_video_list(token, channelTitle, channelId, canonicalBaseUrl, subscriberCount, videosCount, gMongoDb)
  242. def fetch_and_save_account_info():
  243. gMongoClient = pymongo.MongoClient()
  244. gMongoDb = gMongoClient['crawlers']
  245. url = 'https://www.youtube.com/@ganfutong'
  246. headers = {
  247. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  248. 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
  249. 'x-client-data': 'CIq2yQEIpbbJAQipncoBCLXsygEIk6HLAQia/swBCIagzQEIj+HNAQiE4s0BCN/rzQEI5uzNAQjB7s0BCIrvzQEIg/DNAQiG8M0BCL7xzQEIjPLNARj2yc0BGKfqzQEY+fLNAQ==',
  250. 'Cookie': f"{
  251. cookie}",
  252. 'sec-fetch-site': 'same-origin', 'sec-fetch-mode': 'navigate', 'sec-fetch-user': '?1',
  253. 'sec-fetch-dest': 'document', 'accept-encoding': 'gzip, deflate, br',
  254. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
  255. }
  256. resp = requests.get(url=url, headers=headers)
  257. response = scrapy.Selector(text=resp.text)
  258. token = re.findall(r'"continuationCommand":{"token":"(.*?)"', resp.text)[0]
  259. context = response.xpath("//script[contains(string(),'var ytInitialData')]/text()").extract_first()
  260. contextJson = json.loads(context.replace('var ytInitialData = ', '')[:-1])
  261. # 订阅数.header.c4TabbedHeaderRenderer.subscriberCountText
  262. subscriberCount = contextJson['header']['c4TabbedHeaderRenderer']['subscriberCountText']['simpleText']
  263. # 视频数 .header.c4TabbedHeaderRenderer.videosCountText
  264. videosCountTexts = contextJson['header']['c4TabbedHeaderRenderer']['videosCountText']['runs']
  265. channelTitle = contextJson['header']['c4TabbedHeaderRenderer']['title']
  266. channelId = contextJson['header']['c4TabbedHeaderRenderer']['channelId']
  267. apiKey = re.findall(r'"innertubeApiKey":"(.*?)"', resp.text)[0]
  268. canonicalBaseUrl = contextJson['header']['c4TabbedHeaderRenderer']['navigationEndpoint']['browseEndpoint'][
  269. 'canonicalBaseUrl']
  270. videosCount = ''
  271. for i in videosCountTexts:
  272. videosCount += i['text']
  273. logging.info(json.dumps(contextJson))
  274. gMongoDb.get_collection('youtube_account_info').insert_one(item := {
  275. 'channelTitle': channelTitle,
  276. 'channelId': channelId,
  277. 'canonicalBaseUrl': canonicalBaseUrl,
  278. 'subscriberCount': subscriberCount,
  279. 'videosCount': videosCount,
  280. 'apiKey': apiKey,
  281. "createTime": datetime.datetime.now(),
  282. })
if __name__ == '__main__':
    # NOTE(review): `cookie` is read as a module-level global by the fetch_*
    # functions and get_video_list; paste a real YouTube Cookie header value
    # here before running, or requests will be sent unauthenticated.
    cookie = ''
    fetch_and_save_comments_info()

发表评论

表情:
评论列表 (有 0 条评论,163人围观)

还没有评论,来说两句吧...

相关阅读

    相关 Day7.数据采集-爬虫

    数据采集 我们进行数据分析以及挖掘时,前提条件就是需要有数据;如果在公司里作业,我们可以从数据库中导入数据,但同时我们也可以对采集数据来进行分析。采集数据最常用就是我们听