#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Scrape download links from the Coursera website.
"""

import sys
import string
import re
import random
import urllib
import urllib2
import cookielib
import getpass


class Coursera(object):
    """Coursera scraper.

    Performs a simulated login, fetches page source and matches it with
    regular expressions, and saves the resulting links to a file.

    Attributes:
        login_url: URL of the real login endpoint.
        url: URL of the page that is crawled for download links.
        user_name: e-mail address used to log in.
        password: password used to log in.
    """

    def __init__(self, url, user_name, password):
        """Store the target URL and the login credentials.

        Args:
            url: page to crawl for download links.
            user_name: login e-mail; must not be an empty string.
            password: login password; must not be an empty string.

        Raises:
            UserOrPwdNone: if ``user_name`` or ``password`` is empty.
        """
        self.login_url = "https://accounts.coursera.org/api/v1/login"
        self.url = url
        if user_name == "" or password == "":
            # NOTE(review): UserOrPwdNone is not defined in the visible part
            # of this listing; presumably it is declared elsewhere in the
            # module -- confirm.
            # The sys.exit(2) that used to follow this raise could never run
            # and has been dropped.
            raise UserOrPwdNone("the username or password can't empty string")
        self.user_name = user_name
        self.password = password

    def simulation_login(self):
        """Log in by POSTing the credential form to ``self.login_url``.

        A cookie-aware opener is installed as the process-wide urllib2
        default, so later ``urllib2.urlopen`` calls reuse the cookies
        received here.  On ``urllib2.URLError`` a diagnostic is printed and
        the process exits with status 2.
        """

        cookie = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
        urllib2.install_opener(opener)
        form_data, request_header = self.structure_headers()
        req = urllib2.Request(self.login_url, data=form_data,
                              headers=request_header)
        try:
            result = urllib2.urlopen(req)
        except urllib2.URLError as e:
            # Errors carrying a status code expose ``code``; failures to
            # reach the server only expose ``reason``.
            if hasattr(e, "code"):
                print("The server couldn't fulfill the request.Please check your url and read the Reason")
                print("Error code: %s" % e.code)
            elif hasattr(e, "reason"):
                print("We failed to reach a server. Please check your url and read the Reason")
                print("Reason: %s" % e.reason)
            sys.exit(2)
        if result.getcode() == 200:
            print("登录成功...")
60 61 def structure_headers(self): 62 """ 63 头部构造函数 64 """ 65 #模拟表单数据,这个参数不是字典 66 form_data = urllib.urlencode({ 67 "email":self.user_name, 68 "password":self.password, 69 "webrequest":"true" 70 }) 71 user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " 72 "AppleWebKit/537.36 (KHTML, like Gecko) " 73 "Chrome/38.0.2125.111 Safari/537.36") 74 XCSRF2Cookie = 'csrf2_token_%s' % ''.join(self.random_string(8)) 75 XCSRF2Token = ''.join(self.random_string(24)) 76 XCSRFToken = ''.join(self.random_string(24)) 77 cookie = "csrftoken=%s; %s=%s" % (XCSRFToken, XCSRF2Cookie, XCSRF2Token) 78 79 request_header = { 80 "Referer": "https://accounts.coursera.org/signin", #对付防盗链设置, 为跳转来源的url 81 "User-Agent": user_agent, #伪装成浏览器访问 82 "X-Requested-With": "XMLHttpRequest", 83 "X-CSRF2-Cookie": XCSRF2Cookie, 84 "X-CSRF2-Token": XCSRF2Token, 85 "X-CSRFToken": XCSRFToken, 86 "Cookie": cookie 87 } 88 89 return form_data,request_header 90 91 def random_string(self,length): 92 """ 93 随机生成指定长度的字母和数字序列 94 """ 95 return ''.join(random.choice(string.letters + string.digits) for i in xrange(length)) 96 97 def get_links(self): 98 """ 99 爬取页面代码,获取下载MP4和PDF连接100 """101 102 try:103 result = urllib2.urlopen(self.url)104 except urllib2.URLError,e:105 if hasattr(e, "code"):106 print "The server couldn't fulfill the request."107 print "Error code: %s" % e.code108 elif hasattr(e, "reason"):109 print "We failed to reach a server. Please check your url and read the Reason"110 print "Reason: %s" % e.reason111 sys.exit(2)112 content = result.read().decode("utf-8")113 print "读取网页成功..."114 down_links = re.findall(r'")156 password = getpass.getpass("Input your Password > ")157 """158 url = "https://class.coursera.org/{course}/lecture"159 user_name = "15258691200@163.com"160 password = "xxxxxxx"161 spider = Coursera(url.format(course = "python"),user_name,password)162 spider.start_spider()163 164 if __name__ == '__main__':165 main()
通过谷歌浏览器的 Network 工具分析 HTTP 请求头中的内容,然后自行构造请求头,模拟登录。
对比发现:请求头中的 X-CSRF2-Token 和 X-CSRFToken 是完全随机的,X-CSRF2-Cookie 的后 8 位也是随机生成的,均由字母和数字组成。
于是就有了这样的请求头代码:
def structure_headers(self):
    """Build the POST body and HTTP headers for the login request.

    Returns:
        A ``(form_data, request_header)`` tuple, where ``form_data`` is the
        URL-encoded form string and ``request_header`` is a dict of headers.
    """
    # The POST body must be a URL-encoded string, not a dict.
    form_data = urllib.urlencode({
        "email": self.user_name,
        "password": self.password,
        "webrequest": "true",
    })
    user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/38.0.2125.111 Safari/537.36")
    # random_string() already returns a str, so no extra ''.join is needed.
    csrf2_cookie = 'csrf2_token_%s' % self.random_string(8)
    csrf2_token = self.random_string(24)
    csrf_token = self.random_string(24)
    # The Cookie header repeats the same token values sent in the X-CSRF*
    # headers below.
    cookie = "csrftoken=%s; %s=%s" % (csrf_token, csrf2_cookie, csrf2_token)

    request_header = {
        # Referring page URL; works around anti-hotlinking checks.
        "Referer": "https://accounts.coursera.org/signin",
        # Present the request as coming from a browser.
        "User-Agent": user_agent,
        "X-Requested-With": "XMLHttpRequest",
        "X-CSRF2-Cookie": csrf2_cookie,
        "X-CSRF2-Token": csrf2_token,
        "X-CSRFToken": csrf_token,
        "Cookie": cookie,
    }

    return form_data, request_header


def random_string(self, length):
    """Return a random string of ``length`` ASCII letters and digits."""
    # string.letters is locale-dependent on Python 2 and may contain
    # non-ASCII characters; ascii_letters keeps the tokens plain ASCII.
    alphabet = string.ascii_letters + string.digits
    return ''.join(random.choice(alphabet) for _ in range(length))
最后的运行结果:
因为输入的下载链接不正确,所以下载到的内容长度都是 0。