Web Crawler: Collection, Processing, and Storage

墨蓝 2021-09-13 22:40

Target URL: http://www.bilibili.com/video/movie_west_1.html
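The script below covers all three steps of the title in one file: it collects pages by driving PhantomJS through Selenium (with a random User-Agent per request), processes the rendered HTML with BeautifulSoup to pull out each movie's title and link, and stores the results in a MySQL table via MySQLdb, feeding page URLs to worker threads through a Queue.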

    #coding:utf-8
    import _mysql, sys
    import time
    import socket
    import random
    import MySQLdb
    from Queue import Queue
    from threading import Thread
    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities  # for setting request headers

    create_sql = """create table movie(id int(2) not null primary key auto_increment, title varchar(200), href text) default charset=utf8;"""

    # One User-Agent per entry, picked at random for each request. The commas matter:
    # without them, adjacent string literals are silently concatenated into one entry.
    User_Agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/9.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
    ]

    def store(title, url):
        # Parameterized insert; the driver escapes title/url for us
        cur.execute("insert into movie (title,href) VALUES (%s,%s)", (title, url))
        cur.connection.commit()

    def get_source(url):
        # service_args = ['--proxy=localhost:9150', '--proxy-type=socks5']  # proxy settings
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = random.choice(User_Agent_list)
        driver = webdriver.PhantomJS(executable_path=r'D:\phantomjs-2.1.1\bin\phantomjs.exe',
                                     desired_capabilities=dcap)  # add service_args=service_args to enable the proxy
        try:
            driver.get(url)
        except Exception, e:
            print "Exception here; details: %s" % e
            print "Trying once more......"
            driver.get(url)
        time.sleep(5)  # give the page's JavaScript time to render
        html = driver.page_source
        driver.close()
        try:
            soup = BeautifulSoup(html, 'html.parser')
            a_list = soup.findAll("a", {"class": "preview"})
            a_page = soup.findAll("a", {"class": "p active"})[0]
            page = a_page.string  # number of the currently active page
        except Exception, e:
            print "Exception here; details: %s" % e
            return  # without a parsed page there is nothing to store
        print "\n"
        print "Movies on page " + str(page) + ":\n\n"
        for a_each in a_list:
            movie_title = a_each.img['alt']
            movie_href = 'http://www.bilibili.com/' + a_each['href']
            print movie_title, movie_href
            print "Importing movie info into the database:"
            try:
                store(movie_title, movie_href)
            except _mysql.Error, e:
                print "Database error %d: %s" % (e.args[0], e.args[1])
                store(movie_title, movie_href)  # retry the insert once
            print "Movie info imported"

    if __name__ == "__main__":
        conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123456',
                               db='movie_info', port=3306, charset='utf8')
        cur = conn.cursor()  # cursor object
        cur.execute("use movie_info")
        cur.execute("drop table if exists movie")  # "if exists" keeps the first run from failing
        cur.execute(create_sql)
        q = Queue()
        for i in xrange(1, 706):
            newpage = "http://www.bilibili.com/video/movie_west_1.html#!page=" + str(i)
            q.put(newpage)
        print "\nMovie crawler starting......\n"
        for i in xrange(1, 706):
            t = Thread(target=get_source, args=(q.get(),))
            t.setDaemon(True)
            t.start()
            time.sleep(3)
            # socket.setdefaulttimeout(50)  # 50-second connection timeout
            t.join()  # wait for this page before moving on
        print "\nData scraping finished\n"
        cur.close()
        conn.close()
        print "All movie pages imported into the database"
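Once the run finishes, a quick read-back confirms what actually landed in the table. A minimal sketch, assuming the same movie_info database and credentials as above:

    import MySQLdb

    conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123456',
                           db='movie_info', port=3306, charset='utf8')
    cur = conn.cursor()
    cur.execute("select count(*) from movie")
    print "rows stored: %d" % cur.fetchone()[0]   # total movies captured
    cur.execute("select title, href from movie limit 3")
    for title, href in cur.fetchall():            # spot-check a few rows
        print title, href
    cur.close()
    conn.close()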

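One thing worth noting about the main loop: it starts a thread per page and immediately join()s it, so pages are in effect fetched one at a time. A small pool of long-lived workers draining the same Queue would give real concurrency. A minimal sketch, assuming the get_source function and page URLs from above; the pool size of 8 is an arbitrary, polite choice, not something from the original:

    from Queue import Queue
    from threading import Thread

    def worker(q):
        while True:
            url = q.get()
            try:
                get_source(url)   # the scraping function defined above
            finally:
                q.task_done()     # count the page as handled even if it raised

    q = Queue()
    for i in xrange(1, 706):
        q.put("http://www.bilibili.com/video/movie_west_1.html#!page=" + str(i))

    for _ in xrange(8):           # hypothetical pool size; tune to taste
        t = Thread(target=worker, args=(q,))
        t.setDaemon(True)         # daemon workers exit with the main thread
        t.start()

    q.join()                      # block until every queued page is processed

The task_done()/join() pair is what lets the main thread know when the queue has been fully drained, without joining each worker individually.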