Python and Beautiful Soup to Html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
| 今天有个需求:需要抓取站点的 HTML 页面,然后从中解析出想要的几项内容。
pycurl: http://pycurl.sourceforge.net/doc/index.html
BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
使用这两个 Python 模块时,可以先在 IPython 里试探性地调用,查看某个属性或某个函数;用 help() 查看说明会更直白。
在此做个小笔记作今天的学习笔记.
代码写得有点挫呀。。嘿嘿
py code
#!/usr/bin/env python
# -*- coding:utf-8 -*-
'''
Author:mdk
Email:mengdaikun@gmail.com
Date:2014-8-29
py解析html
'''
import StringIO

import pycurl
from bs4 import BeautifulSoup
#get html file from xxx.com
def get_html(url='http://xxx.com'):
    '''
    Download a page with pycurl and return the raw response as a string.

    url: address to fetch. Defaults to the placeholder site the script
         was written against, so existing no-argument calls still work.

    Returns everything libcurl delivered to the write callback. Because
    the HEADER option is switched on, the HTTP response headers are part
    of the returned text and come before the HTML body.

    Raises pycurl.error if the transfer fails.
    '''
    buf = StringIO.StringIO()
    curl = pycurl.Curl()
    try:
        curl.setopt(curl.URL, url)
        # The callback must be the write method of the buffer created
        # above; the original referenced an undefined name (temp1) here
        # and failed with NameError before sending any request.
        curl.setopt(curl.WRITEFUNCTION, buf.write)
        curl.setopt(curl.FOLLOWLOCATION, 1)
        curl.setopt(curl.HEADER, True)
        curl.perform()
        return buf.getvalue()
    finally:
        # Release the handle and the buffer even when perform() raises.
        curl.close()
        buf.close()
def get_url():
    '''
    Print the title and link target of selected navigation entries.

    Fetches the page through get_html(), locates the <ul> element whose
    id is "cui_nav_ul", and for every <a> inside it whose id contains
    "c_ph" prints one line: "<title> <href>", both stripped of
    surrounding whitespace.

    Anchors lacking an id, a title or an href are skipped. Nothing is
    printed when the navigation list is not present in the page.
    Returns None.
    '''
    html = get_html()
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')

    # Find the <ul id="cui_nav_ul"> navigation list.
    nav = soup.find('ul', id='cui_nav_ul')
    if nav is None:
        # find() returns None when nothing matches; there is nothing to list.
        return

    # Walk every <a> tag inside the navigation list.
    for link in nav.find_all('a'):
        # Only anchors whose id attribute contains "c_ph" are of interest.
        link_id = link.get('id')
        if not link_id or 'c_ph' not in link_id:
            continue

        title = link.get('title')
        href = link.get('href')
        if title is None or href is None:
            # Incomplete entry: skip it instead of failing on .strip().
            continue

        try:
            print('%s %s' % (title.strip(), href.strip()))
        except UnicodeError:
            # Best effort, as before: an entry the output stream cannot
            # encode is skipped rather than aborting the whole listing.
            continue
# Run the link listing only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    get_url()
欢迎加入群里讨论相关技术 群号: 250244637
|