urllib¶
Sending Requests¶
Sending data with GET¶
import urllib.request
response = urllib.request.urlopen('http://www.baidu.com')
print(response.read().decode("utf-8"))
Sending data with POST¶
import urllib.parse
import urllib.request
# a POST request needs a data parameter
data = bytes(urllib.parse.urlencode({"world":"hello"}), encoding = 'utf-8')
response = urllib.request.urlopen('http://httpbin.org/post', data = data)
print(response.read())
The timeout parameter¶
If the request times out, an exception is raised.
import socket
import urllib.request
import urllib.error
try:
    response = urllib.request.urlopen("http://httpbin.org/get", timeout = 0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print("TIME OUT")
Responses¶
Response type¶
import urllib.request
response = urllib.request.urlopen("http://www.python.org")
print(type(response))
Status code and response headers¶
import urllib.request
response = urllib.request.urlopen("http://www.python.org")
print(response.status)
print(response.getheaders())
print(response.getheader('Server'))
Getting the response body¶
import urllib.request
response = urllib.request.urlopen("http://www.python.org")
print(response.read().decode('utf-8'))
Using a Request object¶
import urllib.request
request = urllib.request.Request("http://python.org")
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
This produces the same result as the response-body example above. However, with a Request object we can perform more complex operations.
from urllib import request, parse
url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0',
    'Host': 'httpbin.org'
}
dict = {
    'name': 'Germey'
}
data = bytes(parse.urlencode(dict), encoding='utf-8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
Using Handlers¶
Many advanced operations, such as FTP or working with caches, require a Handler.
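The general pattern is always the same: build a Handler, turn it into an opener with build_opener, and use the opener in place of urlopen. A minimal sketch (the HTTPHandler with debuglevel=1 here is just an illustration, not from the original text; it prints the raw HTTP traffic):
import urllib.request
# HTTPHandler(debuglevel=1) makes the opener print the request/response exchange,
# which is handy for seeing what is actually sent.
http_handler = urllib.request.HTTPHandler(debuglevel=1)
opener = urllib.request.build_opener(http_handler)
response = opener.open('http://httpbin.org/get')
print(response.status)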
Proxies¶
import urllib.request
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://www.baidu.com')
print(response.read())
Cookie¶
Cookies are text data kept on the client side to identify the user. In crawling, cookies are mainly used as a mechanism for maintaining a login state.
import http.cookiejar, urllib.request
cookie = http.cookiejar.CookieJar() # to work with cookies, first create a CookieJar object
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
# print the cookies
for item in cookie:
    print(item.name + "=" + item.value)
Since cookies are what keep a user logged in, we often save them to a text file and have the crawler send them back to the server with its requests. Below is how to save cookies to a text file in the Mozilla format.
import http.cookiejar, urllib.request
filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(filename) # use a CookieJar subclass, which provides a save method
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard = True, ignore_expires = True)
Cookies can also be saved in the LWP format:
import http.cookiejar, urllib.request
filename = 'cookie.txt'
cookie = http.cookiejar.LWPCookieJar(filename) # use a CookieJar subclass, which provides a save method
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard = True, ignore_expires = True)
Which format you use does not matter much; just read the cookies back with the same format you used to save them.
import http.cookiejar, urllib.request
filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)
cookie.load(filename, ignore_discard = True, ignore_expires = True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
Exception Handling¶
from urllib import request, error
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.URLError as e:
    print(e.reason) # print the reason for the exception
urllib.error defines two exceptions, URLError and HTTPError. URLError has only a reason attribute, while HTTPError additionally provides code and headers. Exceptions are usually caught like this:
from urllib import request, error
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep = '\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')
You can also verify the exact cause of an exception like this:
import socket
import urllib.request
import urllib.error
try:
    response = urllib.request.urlopen("http://httpbin.org/get", timeout = 0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print("TIME OUT")
For a timeout, e.reason is of type <class 'socket.timeout'>, so isinstance can be used to check exactly which error occurred.
URL Parsing¶
urlparse¶
urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)
urlparse splits a URL into its different components.
from urllib.parse import urlparse
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result), result)
The result is: <class 'urllib.parse.ParseResult'> ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
You can also specify a default scheme for parsing:
from urllib.parse import urlparse
result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)
But if the URL already contains a scheme, the scheme parameter has no effect.
from urllib.parse import urlparse
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(result)
With allow_fragments=False the fragment is not split off; it is merged into the preceding component, and the output is:
ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5#comment', fragment='')
If query is empty as well, the fragment is merged even further forward, into path.
urlunparse¶
This method assembles the components back into a URL.
from urllib.parse import urlunparse
data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))
urljoin¶
This also joins URLs, but it is more powerful.
from urllib.parse import urljoin
print(urljoin('http://www.baidu.com', 'FAQ.html'))
print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html?question=2'))
print(urljoin('http://www.baidu.com?wd=abc', 'https://cuiqingcai.com/index.php'))
print(urljoin('http://www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu.com#comment', '?category=2'))
Output:
http://www.baidu.com/FAQ.html # the two parts are simply joined
https://cuiqingcai.com/FAQ.html # components of the second URL override those of the first
https://cuiqingcai.com/FAQ.html
https://cuiqingcai.com/FAQ.html?question=2
https://cuiqingcai.com/index.php
http://www.baidu.com?category=2#comment
www.baidu.com?category=2#comment
www.baidu.com?category=2
As shown earlier, a URL can be split into six components. If the second argument is incomplete and cannot form a URL on its own, the missing components are filled in from the first argument; components that are present in the second argument take precedence.
urlencode¶
urlencode converts a dict into GET request parameters joined with &. This is handy because URL parameters are often built up as a dict.
from urllib.parse import urlencode
params = {
'name': 'germey',
'age': 22
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)
Result: http://www.baidu.com?name=germey&age=22
Requests¶
Requests is written in Python on top of urllib, but it is much more convenient, saves a lot of work, and fully covers HTTP testing needs.
GET Requests¶
Basic request¶
import requests
response = requests.get('http://httpbin.org/get')
print(response.text)
This prints the basic request information:
{
"args": {},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Connection": "close",
"Host": "httpbin.org",
"User-Agent": "python-requests/2.20.0"
},
"origin": "222.197.179.9",
"url": "http://httpbin.org/get"
}
GET with parameters¶
import requests
response = requests.get("http://httpbin.org/get?name=germey&age=22")
print(response.text)
This is exactly equivalent to the form below, which has the advantage that you don't have to build the query string by hand.
import requests
data = {
'name': 'germey',
'age': 22
}
response = requests.get("http://httpbin.org/get", params=data)
print(response.text)
Parsing JSON¶
This parses the fetched result directly as JSON.
import requests
import json
response = requests.get("http://httpbin.org/get")
print(type(response.text))
print(response.json()) # response.json() simply runs json.loads internally, so it is equivalent to the line below
print(json.loads(response.text))
print(type(response.json()))
Getting binary data¶
Fetching binary data is the usual approach when downloading images or videos.
import requests
response = requests.get("https://github.com/favicon.ico")
print(type(response.text), type(response.content))
print(response.text)
print(response.content) # response.content holds the binary content of the image
To save it, you can write:
import requests
response = requests.get("https://github.com/favicon.ico")
with open('favicon.ico', 'wb') as f:
    f.write(response.content)
Adding headers¶
import requests
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}
response = requests.get("https://www.zhihu.com/explore", headers=headers)
print(response.text)
POST Requests¶
A POST request sends form data, which we can pass in as the data dict.
import requests
data = {'name': 'germey', 'age': '22'}
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}
response = requests.post("http://httpbin.org/post", data=data, headers=headers)
print(response.json())
Responses¶
response attributes¶
Commonly used response attributes are listed below.
import requests
response = requests.get('http://www.jianshu.com')
print(type(response.status_code), response.status_code)
print(type(response.headers), response.headers)
print(type(response.cookies), response.cookies)
print(type(response.url), response.url)
print(type(response.history), response.history)
Checking the status code¶
Below are the names corresponding to each status code. In practice you don't need to remember the numeric codes; using the names has the same effect.
100: ('continue',),
101: ('switching_protocols',),
102: ('processing',),
103: ('checkpoint',),
122: ('uri_too_long', 'request_uri_too_long'),
200: ('ok', 'okay', 'all_ok', 'all_okay', 'all_good', '\\o/', '✓'),
201: ('created',),
202: ('accepted',),
203: ('non_authoritative_info', 'non_authoritative_information'),
204: ('no_content',),
205: ('reset_content', 'reset'),
206: ('partial_content', 'partial'),
207: ('multi_status', 'multiple_status', 'multi_stati', 'multiple_stati'),
208: ('already_reported',),
226: ('im_used',),
# Redirection.
300: ('multiple_choices',),
301: ('moved_permanently', 'moved', '\\o-'),
302: ('found',),
303: ('see_other', 'other'),
304: ('not_modified',),
305: ('use_proxy',),
306: ('switch_proxy',),
307: ('temporary_redirect', 'temporary_moved', 'temporary'),
308: ('permanent_redirect',
'resume_incomplete', 'resume',), # These 2 to be removed in 3.0
# Client Error.
400: ('bad_request', 'bad'),
401: ('unauthorized',),
402: ('payment_required', 'payment'),
403: ('forbidden',),
404: ('not_found', '-o-'),
405: ('method_not_allowed', 'not_allowed'),
406: ('not_acceptable',),
407: ('proxy_authentication_required', 'proxy_auth', 'proxy_authentication'),
408: ('request_timeout', 'timeout'),
409: ('conflict',),
410: ('gone',),
411: ('length_required',),
412: ('precondition_failed', 'precondition'),
413: ('request_entity_too_large',),
414: ('request_uri_too_large',),
415: ('unsupported_media_type', 'unsupported_media', 'media_type'),
416: ('requested_range_not_satisfiable', 'requested_range', 'range_not_satisfiable'),
417: ('expectation_failed',),
418: ('im_a_teapot', 'teapot', 'i_am_a_teapot'),
421: ('misdirected_request',),
422: ('unprocessable_entity', 'unprocessable'),
423: ('locked',),
424: ('failed_dependency', 'dependency'),
425: ('unordered_collection', 'unordered'),
426: ('upgrade_required', 'upgrade'),
428: ('precondition_required', 'precondition'),
429: ('too_many_requests', 'too_many'),
431: ('header_fields_too_large', 'fields_too_large'),
444: ('no_response', 'none'),
449: ('retry_with', 'retry'),
450: ('blocked_by_windows_parental_controls', 'parental_controls'),
451: ('unavailable_for_legal_reasons', 'legal_reasons'),
499: ('client_closed_request',),
# Server Error.
500: ('internal_server_error', 'server_error', '/o\\', '✗'),
501: ('not_implemented',),
502: ('bad_gateway',),
503: ('service_unavailable', 'unavailable'),
504: ('gateway_timeout',),
505: ('http_version_not_supported', 'http_version'),
506: ('variant_also_negotiates',),
507: ('insufficient_storage',),
509: ('bandwidth_limit_exceeded', 'bandwidth'),
510: ('not_extended',),
511: ('network_authentication_required', 'network_auth', 'network_authentication'),
The following two snippets work the same way, the first using a status-code name and the second the numeric code:
import requests
response = requests.get('http://www.jianshu.com/hello.html')
exit() if not response.status_code == requests.codes.not_found else print('404 Not Found')
import requests
response = requests.get('http://www.jianshu.com')
exit() if not response.status_code == 200 else print('Request Successfully')
Advanced Usage¶
File upload¶
File upload uses POST: open the file and pass it in through the files parameter.
import requests
files = {'file': open('favicon.ico', 'rb')}
response = requests.post("http://httpbin.org/post", files=files)
print(response.text)
Getting cookies¶
Cookie details can be read directly from the cookies attribute.
import requests
response = requests.get("https://www.baidu.com")
print(response.cookies)
for key, value in response.cookies.items():
    print(key + '=' + value)
Session persistence¶
With cookies we can maintain a login state, which is useful for simulated logins.
Note that if you want to set a cookie and then read it back, you need a Session object. Without one, the two requests behave like two separate browsers: the second cannot see what the first set. A Session keeps one login session alive across requests.
import requests
s = requests.Session()
s.get('http://httpbin.org/cookies/set/number/123456789')
response = s.get('http://httpbin.org/cookies')
print(response.text)
Certificate verification¶
When fetching an https page, requests first checks the validity of the site's certificate. If the certificate is invalid, an SSL error is raised and the program stops. To avoid this, simply turn certificate verification off.
import requests
response = requests.get('https://www.12306.cn', verify=False)
print(response.status_code)
A warning is still printed, though. To tidy up the output, add the following to suppress it:
import requests
from requests.packages import urllib3
urllib3.disable_warnings()
response = requests.get('https://www.12306.cn', verify=False)
print(response.status_code)
Proxy settings¶
import requests
proxies = {
"http": "http://127.0.0.1:9743",
"https": "https://127.0.0.1:9743",
}
response = requests.get("https://www.taobao.com", proxies=proxies)
print(response.status_code)
If the proxy requires a username and password, write it like this, with the username in the user position and the password in the password position, separated from the host by @.
import requests
proxies = {
"http": "http://user:password@127.0.0.1:9743/",
}
response = requests.get("https://www.taobao.com", proxies=proxies)
print(response.status_code)
If your proxy is ShadowSocks (a SOCKS proxy), install the SOCKS extra and write:
pip3 install 'requests[socks]'
import requests
proxies = {
'http': 'socks5://127.0.0.1:9742',
'https': 'socks5://127.0.0.1:9742'
}
response = requests.get("https://www.taobao.com", proxies=proxies)
print(response.status_code)
Timeout settings¶
import requests
from requests.exceptions import ReadTimeout # import the ReadTimeout exception
try:
    response = requests.get("http://httpbin.org/get", timeout = 0.5)
    print(response.status_code)
except ReadTimeout:
    print('Timeout')
Authentication¶
Some sites ask for a username and password when you log in. You can pass an auth parameter to requests to handle this authentication.
import requests
from requests.auth import HTTPBasicAuth
r = requests.get('http://120.27.34.24:9001', auth=HTTPBasicAuth('user', '123'))
print(r.status_code)
Or, more simply:
import requests
r = requests.get('http://120.27.34.24:9001', auth=('user', '123'))
print(r.status_code)
Exception handling¶
The definitions of these exceptions, and the other exceptions available, can be found in the official Requests documentation.
import requests
from requests.exceptions import ReadTimeout, ConnectionError, RequestException
try:
    response = requests.get("http://httpbin.org/get", timeout = 0.5)
    print(response.status_code)
except ReadTimeout:
    print('Timeout')
except ConnectionError:
    print('Connection error')
except RequestException:
    print('Error')
Regular Expressions¶
A regular expression is a logical formula for strings: a "rule string" built from predefined special characters and their combinations, used to express a filtering logic over strings.
re.match¶
re.match tries to match a pattern from the start of the string; if the match does not succeed at the start, match() returns None.
re.match(pattern, string, flags = 0)
The most basic match¶
import re
content = 'Hello 123 4567 World_This is a Regex Demo'
print(len(content))
result = re.match('^Hello\s\d\d\d\s\d{4}\s\w{10}.*Demo$', content)
print(result)
print(result.group()) # the matched text
print(result.span()) # the span of the match
Generic matching¶
The previous pattern is overly complex; the same effect can be achieved more simply by using .* to match everything in between.
import re
content = 'Hello 123 4567 World_This is a Regex Demo'
result = re.match('^Hello.*Demo$', content) # .* matches everything in between
print(result)
print(result.group())
print(result.span())
Matching a target¶
For example, to extract 1234567 from the string below, mark its left and right boundaries and wrap the target in parentheses.
The left boundary of 1234567 is a whitespace character, hence \s; the target itself is digits, hence (\d+); then \s marks the right boundary, and the rest of the pattern doesn't matter.
import re
content = 'Hello 1234567 World_This is a Regex Demo'
result = re.match('^Hello\s(\d+)\sWorld.*Demo$', content)
print(result)
print(result.group(1))
print(result.span())
Greedy matching¶
import re
content = 'Hello 1234567 World_This is a Regex Demo'
result = re.match('^He.*(\d+).*Demo$', content)
print(result)
print(result.group(1))
Output:
<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This is a Regex Demo'>
7
As the output shows, in greedy mode the leading .* keeps consuming characters, including the digits, right up to the 7; because the following (\d+) still has to match at least one digit, only the final 7 is captured.
Non-greedy matching¶
Adding a ? after .* makes the match non-greedy: it matches as few characters as possible.
import re
content = 'Hello 1234567 World_This is a Regex Demo'
result = re.match('^He.*?(\d+).*Demo$', content)
print(result)
print(result.group(1))
Output:
<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This is a Regex Demo'>
1234567
Because the .*? is followed by \d+, it stops as soon as a digit can start matching, so the whole number 1234567 is captured.
Match flags¶
import re
content = '''Hello 1234567 World_This
is a Regex Demo
'''
result = re.match('^He.*?(\d+).*?Demo$', content, re.S) # re.S is the match flag
print(result.group(1))
Here content spans two lines separated by a newline. Without re.S, the . stops at the newline and the rest is not matched; with this flag, . matches any character, including newlines.
Escaping¶
Sometimes we want to match special characters, for example the $ in $5.00. Since $ and . are themselves special characters in regular expressions, they have to be escaped to be matched literally.
import re
content = 'price is $5.00'
result = re.match('price is \$5\.00', content)
print(result)
Summary: prefer generic matching, use parentheses to capture targets, prefer non-greedy matching, and use re.S when there are newlines.
re.search¶
An inconvenience of re.match is that it matches from the very first character: if the start of the string doesn't match, the rest isn't checked.
re.search scans the whole string and returns the first successful match, so in general prefer search over match.
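A quick sketch of the difference (the sample string is made up for illustration):
import re
content = 'Extra stings Hello 1234567 World_This is a Regex Demo'
# match anchors at the start of the string, so it fails here and returns None
print(re.match('Hello.*?(\d+).*?Demo', content))
# search scans the whole string and finds the match
result = re.search('Hello.*?(\d+).*?Demo', content)
print(result.group(1)) # 1234567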
import re
html = '''<div id="songs-list">
<h2 class="title">经典老歌</h2>
<p class="introduction">
经典老歌列表
</p>
<ul id="list" class="list-group">
<li data-view="2">一路上有你</li>
<li data-view="7">
<a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
</li>
<li data-view="4" class="active">
<a href="/3.mp3" singer="齐秦">往事随风</a>
</li>
<li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li>
<li data-view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a></li>
<li data-view="5">
<a href="/6.mp3" singer="邓丽君"><i class="fa fa-user"></i>但愿人长久</a>
</li>
</ul>
</div>'''
result = re.search('<li.*?active.*?singer="(.*?)">(.*?)</a>', html, re.S)
if result:
    print(result.group(1), result.group(2)) # group(1) is the first parenthesised group, group(2) the second
The output is: 齐秦 往事随风
Here 'active' in the pattern restricts the match to the tag that carries the active class.
Now the case without active, with the same HTML as before:
import re
html = '''<div id="songs-list">
<h2 class="title">经典老歌</h2>
<p class="introduction">
经典老歌列表
</p>
<ul id="list" class="list-group">
<li data-view="2">一路上有你</li>
<li data-view="7">
<a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
</li>
<li data-view="4" class="active">
<a href="/3.mp3" singer="齐秦">往事随风</a>
</li>
<li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li>
<li data-view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a></li>
<li data-view="5">
<a href="/6.mp3" singer="邓丽君">但愿人长久</a>
</li>
</ul>
</div>'''
result = re.search('<li.*?singer="(.*?)">(.*?)</a>', html, re.S)
if result:
    print(result.group(1), result.group(2))
The output is: 任贤齐 沧海一声笑
re.findall¶
re.search finds a single result; re.findall returns all matches.
import re
html = '''<div id="songs-list">
<h2 class="title">经典老歌</h2>
<p class="introduction">
经典老歌列表
</p>
<ul id="list" class="list-group">
<li data-view="2">一路上有你</li>
<li data-view="7">
<a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
</li>
<li data-view="4" class="active">
<a href="/3.mp3" singer="齐秦">往事随风</a>
</li>
<li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li>
<li data-view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a></li>
<li data-view="5">
<a href="/6.mp3" singer="邓丽君">但愿人长久</a>
</li>
</ul>
</div>'''
results = re.findall('<li.*?href="(.*?)".*?singer="(.*?)">(.*?)</a>', html, re.S)
print(results)
print(type(results))
for result in results:
    print(result)
    print(result[0], result[1], result[2])
Output:
[('/2.mp3', '任贤齐', '沧海一声笑'), ('/3.mp3', '齐秦', '往事随风'), ('/4.mp3', 'beyond', '光辉岁月'), ('/5.mp3', '陈慧琳', '记事本'), ('/6.mp3', '邓丽君', '但愿人长久')]
<class 'list'>
('/2.mp3', '任贤齐', '沧海一声笑')
/2.mp3 任贤齐 沧海一声笑
('/3.mp3', '齐秦', '往事随风')
/3.mp3 齐秦 往事随风
('/4.mp3', 'beyond', '光辉岁月')
/4.mp3 beyond 光辉岁月
('/5.mp3', '陈慧琳', '记事本')
/5.mp3 陈慧琳 记事本
('/6.mp3', '邓丽君', '但愿人长久')
/6.mp3 邓丽君 但愿人长久
Now for a harder case: '一路上有你' has no song link and no singer. To handle that with a more general pattern, we can write:
import re
html = '''<div id="songs-list">
<h2 class="title">经典老歌</h2>
<p class="introduction">
经典老歌列表
</p>
<ul id="list" class="list-group">
<li data-view="2">一路上有你</li>
<li data-view="7">
<a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
</li>
<li data-view="4" class="active">
<a href="/3.mp3" singer="齐秦">往事随风</a>
</li>
<li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li>
<li data-view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a></li>
<li data-view="5">
<a href="/6.mp3" singer="邓丽君">但愿人长久</a>
</li>
</ul>
</div>'''
results = re.findall('<li.*?>\s*?(<a.*?>)?(\w+)(</a>)?\s*?</li>', html, re.S)
print(results)
for result in results:
    print(result[1])
The output is:
[('', '一路上有你', ''), ('<a href="/2.mp3" singer="任贤齐">', '沧海一声笑', '</a>'), ('<a href="/3.mp3" singer="齐秦">', '往事随风', '</a>'), ('<a href="/4.mp3" singer="beyond">', '光辉岁月', '</a>'), ('<a href="/5.mp3" singer="陈慧琳">', '记事本', '</a>'), ('<a href="/6.mp3" singer="邓丽君">', '但愿人长久', '</a>')]
一路上有你
沧海一声笑
往事随风
光辉岁月
记事本
但愿人长久
Breaking down this regular expression: <li.*?> matches the opening li tag.
We also have to handle line breaks: some li elements put their content on its own line while others do not, so we write \s*?, where \s matches whitespace (including newlines) and *? makes it optional.
Next comes the a tag, which may or may not be present, so we use (<a.*?>)?: the parentheses treat their contents as a single unit, and the trailing ? means zero or one occurrence, i.e. with or without an a tag.
Then (\w+) captures the song title, (</a>)? optionally matches the closing a tag as one unit, another \s*? allows optional whitespace, and </li> closes the pattern.
A final note on parentheses: as this example shows, they group their contents into a single unit for matching and at the same time act as a capture group whose contents can be printed.
re.sub¶
This method performs string substitution: the first argument is a regular expression, the second is the replacement, and the third is the source string.
import re
content = 'Extra stings Hello 1234567 World_This is a Regex Demo Extra stings'
content = re.sub('\d+', 'Replacement', content)
print(content)
The output is: Extra stings Hello Replacement World_This is a Regex Demo Extra stings.
As you can see, 1234567 has been replaced by the replacement text.
But there is a catch: what if we want to modify the matched text rather than simply replace it?
import re
content = 'Extra stings Hello 1234567 World_This is a Regex Demo Extra stings'
content = re.sub('(\d+)', r'\1 8910', content)
print(content)
The output is: Extra stings Hello 1234567 8910 World_This is a Regex Demo Extra stings
We wrap the part to match in parentheses; as the earlier examples showed, the captured content can be referred back to, here as \1 in the replacement, which re-inserts the contents of the first group. The r prefix keeps the backslash from being treated as an escape.
sub also suggests a new way to approach the earlier findall example: strip out the a tags first, and extracting the song titles becomes much easier.
import re
html = '''<div id="songs-list">
<h2 class="title">经典老歌</h2>
<p class="introduction">
经典老歌列表
</p>
<ul id="list" class="list-group">
<li data-view="2">一路上有你</li>
<li data-view="7">
<a href="/2.mp3" singer="任贤齐">沧海一声笑</a>
</li>
<li data-view="4" class="active">
<a href="/3.mp3" singer="齐秦">往事随风</a>
</li>
<li data-view="6"><a href="/4.mp3" singer="beyond">光辉岁月</a></li>
<li data-view="5"><a href="/5.mp3" singer="陈慧琳">记事本</a></li>
<li data-view="5">
<a href="/6.mp3" singer="邓丽君">但愿人长久</a>
</li>
</ul>
</div>'''
html = re.sub('<a.*?>|</a>', '', html)
print(html)
results = re.findall('<li.*?>(.*?)</li>', html, re.S)
print(results)
for result in results:
    print(result.strip())
The result is:
<div id="songs-list">
<h2 class="title">经典老歌</h2>
<p class="introduction">
经典老歌列表
</p>
<ul id="list" class="list-group">
<li data-view="2">一路上有你</li>
<li data-view="7">
沧海一声笑
</li>
<li data-view="4" class="active">
往事随风
</li>
<li data-view="6">光辉岁月</li>
<li data-view="5">记事本</li>
<li data-view="5">
但愿人长久
</li>
</ul>
</div>
['一路上有你', '\n 沧海一声笑\n ', '\n 往事随风\n ', '光辉岁月', '记事本', '\n 但愿人长久\n ']
一路上有你
沧海一声笑
往事随风
光辉岁月
记事本
但愿人长久
re.compile¶
This method compiles a regular expression into a pattern object, so the pattern can be reused later without writing it out again.
import re
content = '''Hello 1234567 World_This
is a Regex Demo'''
pattern = re.compile('Hello.*Demo', re.S)
result = re.match(pattern, content)
#result = re.match('Hello.*Demo', content, re.S)
print(result)
Exercise¶
import requests
import re
content = requests.get('https://book.douban.com/').text
pattern = re.compile('<li.*?cover.*?href="(.*?)".*?title="(.*?)".*?more-meta.*?author">(.*?)</span>.*?year">(.*?)</span>.*?</li>', re.S)
results = re.findall(pattern, content)
for result in results:
    url, name, author, date = result
    author = re.sub('\s', '', author)
    date = re.sub('\s', '', date)
    print(url, name, author, date)
Beautiful Soup¶
A flexible and convenient web-page parsing library that is efficient and supports multiple parsers. With it you can extract information from web pages conveniently without writing regular expressions.
Install: pip install beautifulsoup4
Parsers¶
Basic usage¶
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.prettify()) # pretty-print the parsed tree; broken markup is fixed up automatically
print(soup.title.string) # print the text content of the title tag
The output looks like this:
<html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<p class="title" name="dromouse">
<b>
The Dormouse's story
</b>
</p>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<!-- Elsie -->
</a>
,
<a class="sister" href="http://example.com/lacie" id="link2">
Lacie
</a>
and
<a class="sister" href="http://example.com/tillie" id="link3">
Tillie
</a>
;
and they lived at the bottom of a well.
</p>
<p class="story">
...
</p>
</body>
</html>
The Dormouse's story
Tag Selectors¶
Selecting elements¶
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.title) # select the title tag
print(type(soup.title))
print(soup.head) # select the head tag
print(soup.p) # select the p tag
Output:
<title>The Dormouse's story</title>
<class 'bs4.element.Tag'>
<head><title>The Dormouse's story</title></head>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
As you can see, when several tags match, the tag selector only returns the first one.
Getting the name¶
This means getting the name of the tag itself.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.title.name)
Output: title
This prints the name of the selected tag.
Getting attributes¶
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.p.attrs['name'])
print(soup.p['name'])
Output:
dromouse
dromouse
Both ways of getting a tag attribute give the same result (here, the value of the name attribute).
Getting the content¶
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.p.string)
The output is: The Dormouse's story
This prints the text content of the first p tag.
Nested selection¶
Selectors can also be chained step by step.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.head.title.string)
Output: The Dormouse's story
Children and descendants¶
html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.p.contents) # returns all direct children as a list
Output:
['\n Once upon a time there were three little sisters; and their names were\n ', <a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>, '\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' \n and\n ', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, '\n and they lived at the bottom of a well.\n ']
Unlike contents, children is an iterator: printing it directly shows the iterator object rather than a list, so you have to iterate over it to get the output you want.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.p.children)
for i, child in enumerate(soup.p.children):
    print(i, child)
Output:
<list_iterator object at 0x1064f7dd8>
0
Once upon a time there were three little sisters; and their names were
1 <a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
2
3 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
4
and
5 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
6
and they lived at the bottom of a well.
There is also a descendants attribute, which likewise returns an iterator; it yields all descendant nodes.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.p.descendants)
for i, child in enumerate(soup.p.descendants):
    print(i, child)
The output is:
<generator object descendants at 0x10650e678>
0
Once upon a time there were three little sisters; and their names were
1 <a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
2
3 <span>Elsie</span>
4 Elsie
5
6
7 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
8 Lacie
9
and
10 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
11 Tillie
12
and they lived at the bottom of a well.
Previously we only reached the contents of the a tag itself; here the a tag's child <span> is reached as well.
Parent and ancestor nodes¶
html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.a.parent) # the parent node of the a tag
Output:
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
This is the parent node of the first a tag.
The following gets the ancestor nodes: the parent, the parent's parent, and so on:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(list(enumerate(soup.a.parents)))
Output:
[(0, <p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>), (1, <body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
</body>), (2, <html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
</body></html>), (3, <html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
</body></html>)]
Sibling nodes¶
html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(list(enumerate(soup.a.next_siblings)))
print(list(enumerate(soup.a.previous_siblings)))
Output:
[(0, '\n'), (1, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>), (2, ' \n and\n '), (3, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>), (4, '\n and they lived at the bottom of a well.\n ')]
[(0, '\n Once upon a time there were three little sisters; and their names were\n ')]
Standard Selectors¶
find_all( name , attrs , recursive , text , **kwargs )¶
Searches the document by tag name, attributes, or text content.
name¶
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all('ul'))
print(type(soup.find_all('ul')[0]))
Output:
[<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>, <ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>]
<class 'bs4.element.Tag'>
As you can see, the result is a list.
Elements can also be extracted by iterating:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
for ul in soup.find_all('ul'):
    print(ul.find_all('li'))
Output:
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
[<li class="element">Foo</li>, <li class="element">Bar</li>]
This allows searching level by level, with nesting.
attrs¶
The argument should be a dict of key-value pairs.
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(attrs={'id': 'list-1'}))
print(soup.find_all(attrs={'name': 'elements'}))
Output:
[<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>]
[<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>]
There is an even simpler way to do the same thing, without attrs:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(id='list-1'))
print(soup.find_all(class_='element'))
The result is:
[<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>]
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]
One thing to note: class is a Python keyword, so it cannot be used directly as a parameter name; write class_ instead.
text¶
text selects by text content. Note that this only returns the matching text strings themselves, not the enclosing tags.
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(text='Foo'))
The output is: ['Foo', 'Foo']
find( name , attrs , recursive , text , **kwargs )¶
find returns a single element; find_all returns all of them.
find returns the first match; otherwise its usage is exactly the same as find_all.
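A quick sketch, reusing the html string defined above:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.find('ul'))  # only the first matching ul tag
print(soup.find('ul') == soup.find_all('ul')[0])  # True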
find_parents() find_parent()¶
find_parents() returns all ancestor nodes; find_parent() returns the direct parent.
find_next_siblings() find_next_sibling()¶
find_next_siblings() returns all following siblings; find_next_sibling() returns the first following sibling.
find_previous_siblings() find_previous_sibling()¶
find_previous_siblings() returns all preceding siblings; find_previous_sibling() returns the first preceding sibling.
find_all_next() find_next()¶
find_all_next() returns all matching nodes after the current node; find_next() returns the first one.
find_all_previous() and find_previous()¶
find_all_previous() returns all matching nodes before the current node; find_previous() returns the first one.
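A small sketch of two of these methods, again reusing the html string from above:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
li = soup.find('li')
print(li.find_parent('ul'))  # the enclosing ul
print(li.find_next_sibling('li'))  # the next li at the same level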
CSS Selectors¶
CSS (Cascading Style Sheets) is a language for describing the presentation of HTML or XML documents. Besides styling pages statically, it can work with scripting languages to format page elements dynamically.
Pass a CSS selector directly to select() to make a selection.
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.select('.panel .panel-heading'))
print(soup.select('ul li'))
print(soup.select('#list-2 .element'))
print(type(soup.select('ul')[0]))
The output is:
[<div class="panel-heading">
<h4>Hello</h4>
</div>]
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]
[<li class="element">Foo</li>, <li class="element">Bar</li>]
<class 'bs4.element.Tag'>
A few notes on the select expressions used above:
First: select the panel-heading inside panel, using . for class names and a space to descend into children.
Second: select all li tags inside ul tags.
Third: select by id, marked with #, then a space and a class selector (.) to pick the desired elements inside.
The level-by-level iteration can also be written like this:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
for ul in soup.select('ul'):
    print(ul.select('li'))
The output is:
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
[<li class="element">Foo</li>, <li class="element">Bar</li>]
Getting attributes¶
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
for ul in soup.select('ul'):
    print(ul['id']) # the two ways of getting an attribute are equivalent
    print(ul.attrs['id'])
The result is:
list-1
list-1
list-2
list-2
Getting content¶
Get the text content inside the tags.
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
for li in soup.select('li'):
    print(li.get_text())
The output is:
Foo
Bar
Jay
Foo
Bar
Summary¶
- The lxml parser is recommended; fall back to html.parser when it fails or the markup is messy
- Tag selectors are fast but their filtering is weak
- Use find() and find_all() to query for a single result or multiple results
- If you are comfortable with CSS selectors, use select()
- Remember the common ways of getting attributes and text
PyQuery¶
A powerful and flexible web-page parsing library. If regular expressions feel too complicated and Beautiful Soup's syntax is too hard to remember, and you already know jQuery, PyQuery is an excellent choice.
Install: pip3 install pyquery
Initialization¶
Initializing from a string¶
html = '''
<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('li'))
Output:
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
The argument here is a CSS selector, and it works just like the CSS selectors in Beautiful Soup: prefix ids with #, classes with ., and use bare names for tags.
Initializing from a URL¶
If you pass a URL, it is requested automatically and the returned HTML is used to build doc.
from pyquery import PyQuery as pq
doc = pq(url='http://www.baidu.com')
print(doc('head'))
Initializing from a file¶
You can also pass filename to read the HTML from a file path.
from pyquery import PyQuery as pq
doc = pq(filename='demo.html')
print(doc('li'))
Basic CSS Selectors¶
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('#container .list li'))
The output is:
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
This selector descends level by level: it finds the li tags inside the element with class list, inside the element with id container. Note that the parts must be separated by spaces.
Finding Elements¶
Child elements¶
find looks for all elements inside the current element that satisfy the CSS selector.
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
print(type(items))
print(items)
lis = items.find('li')
print(type(lis))
print(lis)
The output is:
<class 'pyquery.pyquery.PyQuery'>
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
This way selections can be nested step by step.
The following finds only the direct children:
lis = items.children()
print(type(lis))
print(lis)
A CSS selector can also be applied to the children:
lis = items.children('.active')
print(lis)
Parent elements¶
Each tag has exactly one parent element, which can be obtained with parent().
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
container = items.parent()
print(type(container))
print(container)
Output:
<class 'pyquery.pyquery.PyQuery'>
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
parents() returns all ancestor nodes.
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
parents = items.parents()
print(type(parents))
print(parents)
A CSS selector can of course be applied to the ancestors as well.
parent = items.parents('.wrap')
print(parent)
Sibling elements¶
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.list .item-0.active')
print(li.siblings())
Output:
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0">first item</li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
One difference in this selector: there is no space between .item-0 and .active. Without a space the two apply to the same element, i.e. it must have both the item-0 and active classes.
Likewise, a CSS selector can be passed here as well.
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.list .item-0.active')
print(li.siblings('.active'))
Traversal¶
This section shows how to iterate over the elements found.
A single element¶
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
Output:
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
Multiple elements¶
The above only gives a single element; to work with several matches you need items() to iterate over them.
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
lis = doc('li').items()
print(type(lis))
for li in lis:
    print(li)
Output:
<class 'generator'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
Getting Information¶
Getting attributes¶
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a') # the space means descend into the element, i.e. the a inside it
print(a)
print(a.attr('href')) # get the attribute
print(a.attr.href) # both ways of getting the attribute are equivalent
Output:
<a href="link3.html"><span class="bold">third item</span></a>
link3.html
link3.html
Getting text¶
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)
print(a.text())
Output:
<a href="link3.html"><span class="bold">third item</span></a>
third item
Getting HTML¶
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
print(li.html())
Output:
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<a href="link3.html"><span class="bold">third item</span></a>
DOM Manipulation¶
addClass and removeClass¶
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.removeClass('active')
print(li)
li.addClass('active')
print(li)
Output:
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
Modifying attrs and css¶
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.attr('name', 'link') # if there is no name attribute this adds name="link"; if it already exists it is changed to name="link"
print(li)
li.css('font-size', '14px')
print(li)
Output:
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active" name="link" style="font-size: 14px"><a href="link3.html"><span class="bold">third item</span></a></li>
remove¶
html = '''
<div class="wrap">
Hello, World
<p>This is a paragraph.</p>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())
wrap.find('p').remove() # find the p tag with find, then remove it
print(wrap.text())
Output:
Hello, World This is a paragraph.
Hello, World
Other DOM methods¶
http://pyquery.readthedocs.io/en/latest/api.html
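As one more sketch, append() is one of the jQuery-style methods covered by that API reference; this snippet assumes it and reuses the wrap element from the example above:
wrap.append('<p>A new paragraph.</p>')  # insert new markup inside the element
print(wrap)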
Pseudo-class Selectors¶
CSS3 pseudo-class selectors can be used to pick out specific elements.
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('li:first-child')
print(li)
li = doc('li:last-child')
print(li)
li = doc('li:nth-child(2)') # select by position: the second li
print(li)
li = doc('li:gt(2)') # gt is short for greater than: li elements whose index is greater than 2
print(li)
li = doc('li:nth-child(2n)') # even-numbered li elements; use 2n+1 for odd ones
print(li)
li = doc('li:contains(second)') # li elements containing the given text
print(li)
Output:
<li class="item-0">first item</li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
More CSS selectors: http://www.w3school.com.cn/css/index.asp
Official Documentation¶
http://pyquery.readthedocs.io/
Example: Fetching Weather Information with a Crawler¶
Fetch weather information with a crawler and email it to a given mailbox at a fixed time every day.
Site to crawl: China Weather (www.weather.com.cn/weather/101270101.shtml)
Writing the Program¶
Fetching the weather¶
Open the Chengdu forecast on China Weather, right-click and choose "Inspect", go to "Network", refresh and look at the first request's "Preview": it shows the page we see, so we can crawl this page directly. Then go to "Elements" and use the "Select an element in the page to inspect it" button in the top-left corner to locate the key elements.
[Screenshot: selecting the key elements]
Use Beautiful Soup to grab the date, weather, and temperature, and wrap it in a function:
import requests
from bs4 import BeautifulSoup
def get_weather():
    url = 'http://www.weather.com.cn/weather/101270101.shtml'
    headers = {
        'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    date = soup.find(class_='t').find('h1')
    wea = soup.find(class_='wea')
    tem = soup.find(class_='tem')
    return date, wea, tem
Sending the email¶
For safety, the program asks for the sender's address and password and the recipient's address when it runs.
It uses a QQ mailbox with the POP3/SMTP service enabled; make a note of the authorization code (used as the password here) and keep it safe.
import smtplib
from email.mime.text import MIMEText
from email.header import Header
from_addr = input('请输入发信人邮箱:')
print('\n')
password = input('请输入密码')
print('\n')
to_addr = input('请输入收信人邮箱')
print('\n')
def send_email(date, wea, tem):
    global from_addr, password, to_addr
    smtp_server = 'smtp.qq.com'
    # str.strip() removes characters from both ends of a string;
    # lstrip()/rstrip() trim only the left or right side.
    # By default whitespace and newlines are removed; other characters can be specified.
    text = '成都' + date.text + '\n天气:' + wea.text + '\n气温:' + tem.text.strip()
    msg = MIMEText(text, 'plain', 'utf-8')
    msg['From'] = Header(from_addr)
    msg['To'] = Header(to_addr)
    msg['Subject'] = Header('成都每日天气')
    server = smtplib.SMTP_SSL(smtp_server)
    server.connect(smtp_server, 465)
    server.login(from_addr, password)
    try:
        server.sendmail(from_addr, to_addr, msg.as_string())
        print('邮件发送成功')
    except:
        print('邮件发送失败')
    server.quit()
Sending automatically¶
We use the schedule package to send the email automatically.
import schedule
import time
def job():
    date, wea, tem = get_weather()  # fetch the weather information
    send_email(date, wea, tem)  # send the email
    print('发送完成')

schedule.every().day.at("7:00").do(job)  # send at 7:00 every day; the schedule can be changed
while True:
    schedule.run_pending()
    time.sleep(1)
Final program¶
The program must keep running on a machine that stays on in order to send the email every day; if you want it to run independently of your own computer, deploy it on a cloud server!
import requests
from bs4 import BeautifulSoup
import smtplib
from email.mime.text import MIMEText
from email.header import Header
import schedule
import time
# ------------------------ fetch the weather ----------------------------
def get_weather():
    url = 'http://www.weather.com.cn/weather/101270101.shtml'
    headers = {
        'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    date = soup.find(class_='t').find('h1')
    wea = soup.find(class_='wea')
    tem = soup.find(class_='tem')
    return date, wea, tem

# ------------------------ send the email ----------------------------
def send_email(date, wea, tem):
    global from_addr, password, to_addr
    smtp_server = 'smtp.qq.com'
    # str.strip() removes characters from both ends of a string;
    # lstrip()/rstrip() trim only the left or right side.
    # By default whitespace and newlines are removed; other characters can be specified.
    text = '成都' + date.text + '\n天气:' + wea.text + '\n气温:' + tem.text.strip()
    msg = MIMEText(text, 'plain', 'utf-8')
    msg['From'] = Header(from_addr)
    msg['To'] = Header(to_addr)
    msg['Subject'] = Header('成都每日天气')
    server = smtplib.SMTP_SSL(smtp_server)
    server.connect(smtp_server, 465)
    server.login(from_addr, password)
    try:
        server.sendmail(from_addr, to_addr, msg.as_string())
        print('邮件发送成功')
    except:
        print('邮件发送失败')
    server.quit()

from_addr = input('请输入发信人邮箱:')
print('\n')
password = input('请输入密码')
print('\n')
to_addr = input('请输入收信人邮箱')
print('\n')

# ------------------------ scheduling ----------------------------
def job():
    date, wea, tem = get_weather()  # fetch the weather information
    send_email(date, wea, tem)  # send the email
    print('发送完成')

schedule.every().day.at("7:00").do(job)  # send at 7:00 every day; the schedule can be changed
while True:
    schedule.run_pending()
    time.sleep(1)