博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
【python爬虫】coursera抓取
阅读量:5340 次
发布时间:2019-06-15

本文共 5851 字,大约阅读时间需要 19 分钟。

1 # -*- coding: utf-8 -*-”  2 #!/usr/bin/env python  3   4 """  5 用于抓取coursera网站的下载链接  6 """  7   8 import sys  9 import string 10 import re,random 11 import urllib,urllib2 12 import cookielib 13 import getpass 14  15  16 class Coursera(object): 17     """Coursera类定义 18      19             实现模拟登陆,抓取网页代码和正则匹配,保存连接到文件 20      21     Attributes: 22         login_url:保存真正的登陆页面URL 23         url:保存用于爬取下载连接的URL 24         user_name:存储用户登陆Email 25         password:存储用户登陆密码 26     """ 27      28     def __init__(self,url,user_name,password): 29         self.login_url = "https://accounts.coursera.org/api/v1/login" 30         self.url = url 31         if user_name == "" or password == "": 32             raise UserOrPwdNone("the username or password can't empty string") 33             sys.exit(2) 34         else : 35             self.user_name=user_name 36             self.password = password 37      38     def simulation_login(self): 39         """ 40                     模拟登录函数 41         """ 42          43         cookie = cookielib.CookieJar() 44         opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie)) 45         urllib2.install_opener(opener) 46         form_data,request_header = self.structure_headers() 47         req = urllib2.Request(self.login_url,data = form_data,headers=request_header) 48         try: 49             result = urllib2.urlopen(req) 50         except urllib2.URLError,e: 51             if hasattr(e, "code"): 52                 print "The server couldn't fulfill the request.Please check your url and read the Reason" 53                 print "Error code: %s" % e.code 54             elif hasattr(e, "reason"): 55                 print "We failed to reach a server. Please check your url and read the Reason" 56                 print "Reason: %s" % e.reason 57             sys.exit(2) 58         if result.getcode()==200: 59             print "登录成功..." 
60              61     def structure_headers(self): 62         """ 63                     头部构造函数 64         """ 65         #模拟表单数据,这个参数不是字典 66         form_data = urllib.urlencode({ 67             "email":self.user_name, 68             "password":self.password, 69             "webrequest":"true" 70         }) 71         user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " 72             "AppleWebKit/537.36 (KHTML, like Gecko) " 73             "Chrome/38.0.2125.111 Safari/537.36") 74         XCSRF2Cookie = 'csrf2_token_%s' % ''.join(self.random_string(8)) 75         XCSRF2Token = ''.join(self.random_string(24)) 76         XCSRFToken = ''.join(self.random_string(24)) 77         cookie = "csrftoken=%s; %s=%s" % (XCSRFToken, XCSRF2Cookie, XCSRF2Token) 78          79         request_header = { 80             "Referer": "https://accounts.coursera.org/signin",  #对付防盗链设置, 为跳转来源的url 81             "User-Agent": user_agent, #伪装成浏览器访问 82             "X-Requested-With": "XMLHttpRequest", 83             "X-CSRF2-Cookie": XCSRF2Cookie, 84             "X-CSRF2-Token": XCSRF2Token, 85             "X-CSRFToken": XCSRFToken, 86             "Cookie": cookie 87         } 88          89         return form_data,request_header 90      91     def random_string(self,length): 92         """ 93                     随机生成指定长度的字母和数字序列 94         """ 95         return  ''.join(random.choice(string.letters + string.digits) for i in xrange(length)) 96      97     def get_links(self): 98         """ 99                     爬取页面代码,获取下载MP4和PDF连接100         """101         102         try:103             result = urllib2.urlopen(self.url)104         except urllib2.URLError,e:105             if hasattr(e, "code"):106                 print "The server couldn't fulfill the request."107                 print "Error code: %s" % e.code108             elif hasattr(e, "reason"):109                 print "We failed to reach a server. 
Please check your url and read the Reason"110                 print "Reason: %s" % e.reason111             sys.exit(2)112         content = result.read().decode("utf-8")113         print "读取网页成功..."114         down_links = re.findall(r'
")156 password = getpass.getpass("Input your Password > ")157 """158 url = "https://class.coursera.org/{course}/lecture"159 user_name = "15258691200@163.com"160 password = "xxxxxxx"161 spider = Coursera(url.format(course = "python"),user_name,password)162 spider.start_spider()163 164 if __name__ == '__main__':165 main()

 

通过谷歌浏览器开发者工具的 Network 面板分析登录请求的 HTTP 请求头,然后在代码中自行构造这些请求头,模拟登录。

对比发现:请求头中的 X-CSRF2-Token 和 X-CSRFToken 是完全随机的字符串,X-CSRF2-Cookie 只有后 8 位是随机生成的;这些随机部分都由字母和数字组成。

 

于是就有了这样的请求头代码:

def structure_headers(self):
    """Build the login form payload and the spoofed request headers.

    Reads ``self.user_name`` and ``self.password`` and generates fresh
    random CSRF values (via ``self.random_string``) on every call.

    Returns:
        A ``(form_data, request_header)`` tuple. ``form_data`` is the
        URL-encoded form body (a string, not a dict); ``request_header``
        is a dict of HTTP header names to values.
    """
    # Imported locally so the method works on both Python 2 and Python 3.
    try:
        from urllib import urlencode  # Python 2
    except ImportError:
        from urllib.parse import urlencode  # Python 3

    # Simulated form data; the request body is an encoded string, not a dict.
    form_data = urlencode({
        "email": self.user_name,
        "password": self.password,
        "webrequest": "true",
    })

    user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/38.0.2125.111 Safari/537.36")

    # random_string() already returns a str, so no extra join is needed.
    csrf2_cookie = "csrf2_token_%s" % self.random_string(8)
    csrf2_token = self.random_string(24)
    csrf_token = self.random_string(24)
    # The same random values are sent as headers and inside the Cookie header.
    cookie = "csrftoken=%s; %s=%s" % (csrf_token, csrf2_cookie, csrf2_token)

    request_header = {
        # Anti-hotlinking: the URL the request claims to come from.
        "Referer": "https://accounts.coursera.org/signin",
        # Pose as a regular browser.
        "User-Agent": user_agent,
        "X-Requested-With": "XMLHttpRequest",
        "X-CSRF2-Cookie": csrf2_cookie,
        "X-CSRF2-Token": csrf2_token,
        "X-CSRFToken": csrf_token,
        "Cookie": cookie,
    }
    return form_data, request_header


def random_string(self, length):
    """Return ``length`` random characters drawn from ASCII letters and digits."""
    # string.ascii_letters replaces string.letters, which is locale-dependent
    # on Python 2 and does not exist on Python 3.
    alphabet = string.ascii_letters + string.digits
    return "".join(random.choice(alphabet) for _ in range(length))
View Code

 

最后的运行结果:

因为输入的下载链接不正确,所以下载到的内容长度都是 0。

转载于:https://www.cnblogs.com/fjl-vxee/p/6694923.html

你可能感兴趣的文章
学习RESTFul架构
查看>>
分析语句执行步骤并对排出耗时比较多的语句
查看>>
原生JS轮播-各种效果的极简实现
查看>>
软件工程总结作业---提问回顾与个人总结
查看>>
计数器方法使用?
查看>>
带你全面了解高级 Java 面试中需要掌握的 JVM 知识点
查看>>
sonar结合jenkins
查看>>
解决VS+QT无法生成moc文件的问题
查看>>
AngularJs练习Demo14自定义服务
查看>>
stat filename
查看>>
关于空想X
查看>>
CF1067C Knights 构造
查看>>
[BZOJ2938] 病毒
查看>>
webstorm修改文件,webpack-dev-server不会自动编译刷新
查看>>
Scikit-learn 库的使用
查看>>
CSS: caption-side 属性
查看>>
python 用数组实现队列
查看>>
认证和授权(Authentication和Authorization)
查看>>
Mac上安装Tomcat
查看>>
CSS3中box-sizing的理解
查看>>