本文共 3029 字,大约阅读时间需要 10 分钟。
学习Python爬虫过程中的心得体会以及知识点的整理,方便我自己查找,也希望可以和大家一起交流。
from bs4 import BeautifulSoup
import bs4
import re

# Sample document to analyse.  The markup must be present: every lookup below
# (soup.title, soup.p, soup.a, ...) returns None on plain text and the script
# would then fail on the first attribute access.
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title aq">
<b>The Dormouse's story</b>
</p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# In every snippet below, the trailing comment shows the expected output.

# Build the BeautifulSoup object from the HTML string.  `from_encoding` is
# deliberately not passed: it only applies to byte input, and bs4 warns and
# ignores it when the markup is already a str.
soup = BeautifulSoup(html_doc, 'html.parser')

# Print the first <title> tag.
print(soup.title)
# <title>The Dormouse's story</title>

# Print the tag name of the first <title> tag.
print(soup.title.name)
# title

# Print the text contained in the first <title> tag.
print(soup.title.string)
# The Dormouse's story

# Print the tag name of the parent of the first <title> tag.
print(soup.title.parent.name)
# head

# Print the first <p> tag.
print(soup.p)
# <p class="title aq">
# <b>The Dormouse's story</b>
# </p>

# Print the class attribute of the first <p> tag.
print(soup.p['class'])
# ['title', 'aq']

# Print the href attribute of the first <a> tag.
print(soup.a['href'])
# http://example.com/elsie

# Tag attributes can be added, removed or modified; they are handled the same
# way as dictionary entries.  The examples are left commented out, as in the
# original article.
# Change the href attribute of the first <a> tag to http://www.baidu.com/
# soup.a['href'] = 'http://www.baidu.com/'
# Add a name attribute to the first <a> tag
# soup.a['name'] = u'百度'
# Delete the class attribute of the first <a> tag
# del soup.a['class']

# Print all child nodes of the first <p> tag.
print(soup.p.contents)
# ['\n', <b>The Dormouse's story</b>, '\n']

# Print the first <a> tag.
print(soup.a)
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

# Print every <a> tag, as a list.
print(soup.find_all('a'))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

# Print the first tag whose id attribute equals "link3".
print(soup.find(id="link3"))
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

# Print all text content of the document, with the tags removed.
print(soup.get_text())
# The Dormouse's story
#
# The Dormouse's story
#
# Once upon a time there were three little sisters; and their names were
# Elsie,
# Lacie and
# Tillie;
# and they lived at the bottom of a well.
# ...

# Print all attributes of the first <a> tag.
print(soup.a.attrs)
# {'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}

for link in soup.find_all('a'):
    # Print the href attribute of each link.
    print(link.get('href'))
# http://example.com/elsie
# http://example.com/lacie
# http://example.com/tillie

# Iterate over the child nodes of soup.p and print each one.  The first and
# last children are the newline text nodes around the <b> tag.
for child in soup.p.children:
    print("对soup.p的子节点进行循环输出", child)
# 对soup.p的子节点进行循环输出
#
# 对soup.p的子节点进行循环输出 <b>The Dormouse's story</b>
# 对soup.p的子节点进行循环输出
#

# Regex match: print the name of every tag whose name contains the letter "b".
for tag in soup.find_all(re.compile(r"b")):
    print(tag.name)
# body
# b
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Directory (with trailing slash) that receives the downloaded images and the
# rewritten HTML page.  It is not created here and must already exist.
PATH = "D:/MyProject/image/"


def getimg(img_url):
    """Download a single image into PATH and return the local file path.

    Args:
        img_url: Absolute URL of the image.

    Returns:
        The path the image was written to: PATH followed by everything after
        the last '/' of ``img_url``.
    """
    img_name = img_url[img_url.rfind('/') + 1:]
    file = PATH + img_name
    # No second positional argument: in requests.get it is `params`, so the
    # former "html.parser" value was sent to the server as a query string.
    r = requests.get(img_url)
    # `with` closes the file even if the write fails.
    with open(file, 'wb') as o:
        o.write(r.content)
    return file


def main(url):
    """Mirror a page's images locally and save the page pointing at the copies.

    Fetches ``url``, downloads every ``<img>`` that has a ``src`` attribute,
    replaces that attribute with the local file path and writes the modified
    document to ``test.html`` inside PATH.

    Args:
        url: Address of the page to process.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "lxml")
    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            # <img> without a usable src: nothing to download.
            continue
        # Resolve relative references against the page address; absolute
        # URLs are returned unchanged by urljoin.
        img_url = urljoin(url, src)
        print(img_url)
        img['src'] = getimg(img_url)
    # The file is opened in binary mode, so the document has to be encoded to
    # bytes; writing str(soup) raises TypeError on Python 3.
    with open("D:/MyProject/image/test.html", 'wb') as o:
        o.write(soup.encode("utf-8"))


if __name__ == "__main__":
    url = "https://www.freebuf.com/"
    main(url)
转载地址:http://ubazi.baihongyu.com/