【instagram】爬虫照片墙PC端数据采集

﹏ヽ暗。殇╰゛Y 2024-04-17 11:27 140阅读 0赞

喜欢一朵花,未必要将它摘下

(此处原文包含一张截图,转载时图片缺失)

采集案例

  1. package main
  2. import (
  3. "context"
  4. "encoding/json"
  5. "fmt"
  6. "go.mongodb.org/mongo-driver/mongo"
  7. "go.mongodb.org/mongo-driver/mongo/options"
  8. "io"
  9. "log"
  10. "net/http"
  11. "net/url"
  12. "strings"
  13. "sync"
  14. "time"
  15. )
// Network endpoints used by the crawler.
const (
	// proxyURL is the local HTTP proxy every Instagram request is routed through.
	proxyURL = "http://127.0.0.1:2333"
	// mongoURL is the MongoDB instance that stores the crawled data.
	mongoURL = "mongodb://233.233.233.233:2333"
)
// blogInfo is a single Instagram post (one timeline-media node) as persisted
// into the "crawlers" database blog-info collections.
type blogInfo struct {
	UserID       string    `json:"userId" bson:"userId"`             // Instagram user id of the post owner
	Title        string    `json:"title" bson:"title"`               // caption text of the post
	CommentCount float64   `json:"commentCount" bson:"commentCount"` // float64 because the value comes straight from parsed JSON
	LikeCount    float64   `json:"likeCount" bson:"likeCount"`       // float64 for the same reason
	Shortcode    string    `json:"shortcode" bson:"shortcode"`       // short id that identifies the post URL
	PublishTime  time.Time `json:"publishTime" bson:"publishTime"`   // taken_at_timestamp converted to time.Time
	CreateTime   time.Time `json:"createTime" bson:"createTime"`     // when this record was crawled
}
// UserProfile is one Instagram account's profile summary as persisted into
// the "crawlers.ins_user_profile" collection.
type UserProfile struct {
	UserID    string    `json:"userId" bson:"userId"`       // Instagram user id
	UserName  string    `json:"userName" bson:"userName"`   // Instagram handle
	Followers float64   `json:"followers" bson:"followers"` // follower count (float64: raw JSON number)
	Biography string    `json:"biography" bson:"biography"` // profile bio text
	Follow    float64   `json:"follow" bson:"follow"`       // following count
	Blog      float64   `json:"blog" bson:"blog"`           // total post count
	CreateTime time.Time `json:"createTime" bson:"createTime"` // when this record was crawled
}
  38. func crawlBlogInfo(userId string, endCursor string, mgClient *mongo.Client) (bool, string, error) {
  39. proxyURL, err := url.Parse(proxyURL)
  40. if err != nil {
  41. fmt.Println("Error parsing proxy URL:", err)
  42. return false, "", err
  43. }
  44. transport := &http.Transport{
  45. Proxy: http.ProxyURL(proxyURL),
  46. }
  47. client := &http.Client{
  48. Transport: transport,
  49. }
  50. params := `%7B"id"%3A"` + userId + `"%2C"after"%3A"` + endCursor + `"%2C"first"%3A12%7D`
  51. urlStr := fmt.Sprintf("https://www.instagram.com/graphql/query/?doc_id=%s&variables=%s", "自定义doc_id", params)
  52. req, err := http.NewRequest("GET", urlStr, nil)
  53. if err != nil {
  54. fmt.Println("Error creating request:", err)
  55. return false, "", err
  56. }
  57. req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
  58. req.Header.Set("x-ig-app-id", "app_id")
  59. resp, err := client.Do(req)
  60. if err != nil {
  61. fmt.Println("Error fetching the page:", err)
  62. return false, "", err
  63. }
  64. defer resp.Body.Close()
  65. body, err := io.ReadAll(resp.Body)
  66. if err != nil {
  67. fmt.Println("Error reading response body:", err)
  68. return false, "", err
  69. }
  70. var jsonData map[string]interface{}
  71. err = json.Unmarshal(body, &jsonData)
  72. if err != nil {
  73. fmt.Println("Error parsing JSON:", err)
  74. return false, "", err
  75. }
  76. var rootPath []interface{}
  77. if data, ok := jsonData["data"].(map[string]interface{}); ok {
  78. if user, ok := data["user"].(map[string]interface{}); ok {
  79. if edgeOwnerToTimelineMedia, ok := user["edge_owner_to_timeline_media"].(map[string]interface{}); ok {
  80. if edges, ok := edgeOwnerToTimelineMedia["edges"].([]interface{}); ok {
  81. rootPath = edges
  82. }
  83. }
  84. }
  85. }
  86. if rootPath == nil {
  87. fmt.Println("Unexpected JSON structure")
  88. return false, "", nil
  89. }
  90. for _, item := range rootPath {
  91. titleNode := item.(map[string]interface{})["node"].(map[string]interface{})["edge_media_to_caption"].(map[string]interface{})["edges"].([]interface{})
  92. var title string
  93. if len(titleNode) == 0 {
  94. log.Println("title : No data available in edges")
  95. } else {
  96. title = item.(map[string]interface{})["node"].(map[string]interface{})["edge_media_to_caption"].(map[string]interface{})["edges"].([]interface{})[0].(map[string]interface{})["node"].(map[string]interface{})["text"].(string)
  97. }
  98. if title != "" {
  99. commentCount := item.(map[string]interface{})["node"].(map[string]interface{})["edge_media_to_comment"].(map[string]interface{})["count"].(float64)
  100. likeCount := item.(map[string]interface{})["node"].(map[string]interface{})["edge_media_preview_like"].(map[string]interface{})["count"].(float64)
  101. shortcode := item.(map[string]interface{})["node"].(map[string]interface{})["shortcode"].(string)
  102. tm := item.(map[string]interface{})["node"].(map[string]interface{})["taken_at_timestamp"].(float64)
  103. timestamp := time.Unix(int64(tm), 0)
  104. insBlogInfo := mgClient.Database("crawlers").Collection("ins_blog_info_v1")
  105. insBlogItem := blogInfo{
  106. UserID: userId,
  107. Title: title,
  108. CommentCount: commentCount,
  109. LikeCount: likeCount,
  110. Shortcode: shortcode,
  111. PublishTime: timestamp,
  112. CreateTime: time.Now(),
  113. }
  114. insertOneRes, insertErr := insBlogInfo.InsertOne(context.TODO(), insBlogItem)
  115. if insertErr != nil {
  116. insertFlag := strings.Index(insertErr.Error(), "[E11000 duplicate key error collection")
  117. if insertFlag != -1 {
  118. fmt.Println("DATA REPEAT!!!")
  119. continue
  120. } else {
  121. fmt.Println("CODE ERROR!!!")
  122. fmt.Println(insertErr.Error())
  123. continue
  124. }
  125. } else {
  126. fmt.Printf("写入成功:%v\n", insertOneRes.InsertedID)
  127. }
  128. } else {
  129. fmt.Println("title is empty")
  130. continue
  131. }
  132. }
  133. hasNextPage := jsonData["data"].(map[string]interface{})["user"].(map[string]interface{})["edge_owner_to_timeline_media"].(map[string]interface{})["page_info"].(map[string]interface{})["has_next_page"].(bool)
  134. endCursorNext := jsonData["data"].(map[string]interface{})["user"].(map[string]interface{})["edge_owner_to_timeline_media"].(map[string]interface{})["page_info"].(map[string]interface{})["end_cursor"].(string)
  135. return hasNextPage, endCursorNext, nil
  136. }
  137. func crawlUserProfile(wg *sync.WaitGroup, urlStr string, mgClient *mongo.Client) {
  138. defer wg.Done()
  139. proxyURL, err := url.Parse(proxyURL)
  140. if err != nil {
  141. fmt.Println("Error parsing proxy URL:", err)
  142. return
  143. }
  144. transport := &http.Transport{
  145. Proxy: http.ProxyURL(proxyURL),
  146. }
  147. client := &http.Client{
  148. Transport: transport,
  149. }
  150. req, err := http.NewRequest("GET", urlStr, nil)
  151. if err != nil {
  152. fmt.Println("Error creating request:", err)
  153. return
  154. }
  155. req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
  156. req.Header.Set("x-ig-app-id", "自定义app_id")
  157. resp, err := client.Do(req)
  158. if err != nil {
  159. fmt.Println("Error fetching the page:", err)
  160. return
  161. }
  162. defer resp.Body.Close()
  163. body, err := io.ReadAll(resp.Body)
  164. if err != nil {
  165. fmt.Println("Error reading response body:", err)
  166. return
  167. }
  168. var jsonData map[string]interface{}
  169. err = json.Unmarshal(body, &jsonData)
  170. if err != nil {
  171. fmt.Println("Error parsing JSON:", err)
  172. return
  173. }
  174. var biography string = jsonData["data"].(map[string]interface{})["user"].(map[string]interface{})["biography"].(string)
  175. followers := jsonData["data"].(map[string]interface{})["user"].(map[string]interface{})["edge_followed_by"].(map[string]interface{})["count"].(float64)
  176. fllow := jsonData["data"].(map[string]interface{})["user"].(map[string]interface{})["edge_follow"].(map[string]interface{})["count"].(float64)
  177. blog := jsonData["data"].(map[string]interface{})["user"].(map[string]interface{})["edge_owner_to_timeline_media"].(map[string]interface{})["count"].(float64)
  178. userName := jsonData["data"].(map[string]interface{})["user"].(map[string]interface{})["edge_owner_to_timeline_media"].(map[string]interface{})["edges"].([]interface{})[0].(map[string]interface{})["node"].(map[string]interface{})["owner"].(map[string]interface{})["username"].(string)
  179. userId := jsonData["data"].(map[string]interface{})["user"].(map[string]interface{})["edge_owner_to_timeline_media"].(map[string]interface{})["edges"].([]interface{})[0].(map[string]interface{})["node"].(map[string]interface{})["owner"].(map[string]interface{})["id"].(string)
  180. insUserProfile := mgClient.Database("crawlers").Collection("ins_user_profile")
  181. insUserItem := UserProfile{
  182. UserID: userId,
  183. UserName: userName,
  184. Followers: followers,
  185. Biography: biography,
  186. Follow: fllow,
  187. Blog: blog,
  188. CreateTime: time.Now(),
  189. }
  190. fmt.Println(insUserItem)
  191. insertOneRes, insertErr := insUserProfile.InsertOne(context.TODO(), insUserItem)
  192. if insertErr != nil {
  193. insertFlag := strings.Index(insertErr.Error(), "[E11000 duplicate key error collection")
  194. if insertFlag != -1 {
  195. fmt.Println("DATA REPEAT!!!")
  196. } else {
  197. fmt.Println("CODE ERROR!!!")
  198. fmt.Println(insertErr.Error())
  199. }
  200. } else {
  201. fmt.Printf("写入成功:%v\n", insertOneRes.InsertedID)
  202. }
  203. rootPath := jsonData["data"].(map[string]interface{})["user"].(map[string]interface{})["edge_owner_to_timeline_media"].(map[string]interface{})["edges"].([]interface{})
  204. for _, item := range rootPath {
  205. titleNode := item.(map[string]interface{})["node"].(map[string]interface{})["edge_media_to_caption"].(map[string]interface{})["edges"].([]interface{})
  206. var title string
  207. if len(titleNode) == 0 {
  208. log.Println("title : No data available in edges")
  209. } else {
  210. title = item.(map[string]interface{})["node"].(map[string]interface{})["edge_media_to_caption"].(map[string]interface{})["edges"].([]interface{})[0].(map[string]interface{})["node"].(map[string]interface{})["text"].(string)
  211. }
  212. commentCount := item.(map[string]interface{})["node"].(map[string]interface{})["edge_media_to_comment"].(map[string]interface{})["count"].(float64)
  213. likeCount := item.(map[string]interface{})["node"].(map[string]interface{})["edge_media_preview_like"].(map[string]interface{})["count"].(float64)
  214. shortcode := item.(map[string]interface{})["node"].(map[string]interface{})["shortcode"].(string)
  215. tm := item.(map[string]interface{})["node"].(map[string]interface{})["taken_at_timestamp"].(float64)
  216. timestamp := time.Unix(int64(tm), 0)
  217. insBlogInfo := mgClient.Database("crawlers").Collection("ins_blog_info")
  218. insBlogItem := blogInfo{
  219. UserID: userId,
  220. Title: title,
  221. CommentCount: commentCount,
  222. LikeCount: likeCount,
  223. Shortcode: shortcode,
  224. PublishTime: timestamp,
  225. CreateTime: time.Now(),
  226. }
  227. insertOneRes, insertErr := insBlogInfo.InsertOne(context.TODO(), insBlogItem)
  228. if insertErr != nil {
  229. insertFlag := strings.Index(insertErr.Error(), "[E11000 duplicate key error collection")
  230. if insertFlag != -1 {
  231. fmt.Println("DATA REPEAT!!!")
  232. } else {
  233. fmt.Println("CODE ERROR!!!")
  234. fmt.Println(insertErr.Error())
  235. }
  236. } else {
  237. fmt.Printf("写入成功:%v\n", insertOneRes.InsertedID)
  238. }
  239. }
  240. hasNextPage := jsonData["data"].(map[string]interface{})["user"].(map[string]interface{})["edge_owner_to_timeline_media"].(map[string]interface{})["page_info"].(map[string]interface{})["has_next_page"].(bool)
  241. endCursor := jsonData["data"].(map[string]interface{})["user"].(map[string]interface{})["edge_owner_to_timeline_media"].(map[string]interface{})["page_info"].(map[string]interface{})["end_cursor"].(string)
  242. for hasNextPage {
  243. hasNextPage, endCursor, err = crawlBlogInfo(userId, endCursor, mgClient)
  244. if err != nil {
  245. fmt.Println("Error fetching the page:", err)
  246. } else {
  247. fmt.Println("NextPage:", endCursor)
  248. }
  249. }
  250. }
  251. func main() {
  252. clientOptions := options.Client().ApplyURI(mongoURL)
  253. mgClient, cllectErr := mongo.Connect(context.TODO(), clientOptions)
  254. if cllectErr != nil {
  255. log.Fatal(cllectErr)
  256. }
  257. pingErr := mgClient.Ping(context.TODO(), nil)
  258. if pingErr != nil {
  259. log.Fatal(pingErr)
  260. }
  261. urls := []string{
  262. "https://www.instagram.com/api/v1/users/web_profile_info/?username=visit_nanchang",
  263. }
  264. var wg sync.WaitGroup
  265. for _, url := range urls {
  266. wg.Add(1)
  267. go crawlUserProfile(&wg, url, mgClient)
  268. }
  269. wg.Wait() // 等待所有goroutine完成
  270. fmt.Println("所有请求完成")
  271. }

发表评论

表情:
评论列表 (有 0 条评论,140人围观)

还没有评论,来说两句吧...

相关阅读

    相关 CSS照片

    开发工具与关键技术:DW 作者:文泽钦 撰写时间:2019年1月31日 今天跟大家分享一个css照片墙,代码也很简单,能省略的效果也省略了,大家也不要学我,不然学校也

    相关 照片

    开发工具与关键技术:Adobe Dreamweaver CC 2017 CSS 作者:廖亚星 撰写时间:2019年2月15日 下面是让多张照片错乱分布在页面上,当移