How to Parse a Local HTML File in Python?
Prerequisite: BeautifulSoup
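Beautiful Soup ships as the bs4 package and can be installed with pip (pip install beautifulsoup4), together with the lxml parser used throughout this article (pip install lxml). The first example additionally uses the requests library (pip install requests).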
Parsing means breaking a file or an input into pieces of information/data that can be stored for later use. Sometimes the data we need already sits in a file on our computer, and parsing techniques can be used to pull it out. Parsing covers a range of techniques for extracting data from a file; the ones shown below include modifying the file, removing content from it, printing its data, traversing it with the recursiveChildGenerator method, finding the children of a tag, and scraping a web page from a link to extract useful information.
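Before going through the individual techniques, the sketch below shows the pattern that every example in this article follows: open a local HTML file, pass its contents to BeautifulSoup together with the lxml parser, and query the resulting tree. The file name index.html is simply the placeholder used throughout; the context manager here is a small addition so that the file handle is closed automatically, while the examples that follow open the file directly.
Python3
# Importing BeautifulSoup class from the bs4 module
from bs4 import BeautifulSoup
# Opening the local HTML file with a context manager so it is closed automatically
with open("index.html", "r") as f:
    # Creating a BeautifulSoup object and specifying the parser
    soup = BeautifulSoup(f.read(), 'lxml')
# The parsed tree can now be queried, as in the examples below
print(soup.prettify())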
Modifying the file
The prettify method is used to make the HTML code fetched from https://festive-knuth-1279a2.netlify.app/ look better: prettify reformats the code into the standard indented form you would see in an editor such as VS Code.
Example:
Python3
# Importing BeautifulSoup class from the bs4 module
from bs4 import BeautifulSoup
# Importing the HTTP library
import requests as req
# Requesting for the website
Web = req.get('https://festive-knuth-1279a2.netlify.app/')
# Creating a BeautifulSoup object and specifying the parser
S = BeautifulSoup(Web.text, 'lxml')
# Using the prettify method
print(S.prettify())
Output:
Removing a tag
A tag can be removed by selecting the second li element with the select_one method and a CSS selector, deleting it with the decompose method, and then printing the modified HTML code from the index.html file with the prettify method.
Example:
File used:
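The examples in this article assume an index.html file on disk. The snippet below writes a hypothetical one containing the kinds of tags (head, h1, h2, h3, p, ul, li) that the examples access; it is an illustrative stand-in rather than the exact file used here, so the printed output will differ in its details.
Python3
# A hypothetical stand-in for index.html, not the exact file used in this article
sample_html = """<html>
<head><title>Sample page</title></head>
<body>
<h1>Heading 1</h1>
<h2>Heading 2</h2>
<h3>Heading 3</h3>
<p>First paragraph</p>
<p>Second paragraph</p>
<ul>
<li>Item one</li>
<li>Item two</li>
<li>Item three</li>
</ul>
</body>
</html>"""
# Writing the sample file so the examples below can be run as-is
with open("index.html", "w") as f:
    f.write(sample_html)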
Python3
# Importing BeautifulSoup class from the bs4 module
from bs4 import BeautifulSoup
# Opening the html file
HTMLFile = open("index.html", "r")
# Reading the file
index = HTMLFile.read()
# Creating a BeautifulSoup object and specifying the parser
S = BeautifulSoup(index, 'lxml')
# Using the select_one method to find the second li element
Tag = S.select_one('li:nth-of-type(2)')
# Using the decompose method
Tag.decompose()
# Using the prettify method to modify the code
print(S.body.prettify())
Output:
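Note that decompose permanently destroys the removed tag. If the removed tag might be needed again, the extract method is an alternative: it also detaches the tag from the tree but returns it instead of destroying it. A minimal sketch, assuming the same hypothetical index.html as above:
Python3
# Importing BeautifulSoup class from the bs4 module
from bs4 import BeautifulSoup
# Opening and reading the html file
with open("index.html", "r") as f:
    S = BeautifulSoup(f.read(), 'lxml')
# extract detaches the tag from the tree and returns it
Removed = S.select_one('li:nth-of-type(2)').extract()
# The tag is gone from the document but still available as an object
print(Removed)
print(S.ul.prettify())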
Finding tags
Tags can be looked up directly as attributes of the parsed BeautifulSoup object and printed with print().
Example:
Python3
# Importing BeautifulSoup class from the bs4 module
from bs4 import BeautifulSoup
# Opening the html file
HTMLFile = open("index.html", "r")
# Reading the file
index = HTMLFile.read()
# Creating a BeautifulSoup object and specifying the parser
Parse = BeautifulSoup(index, 'lxml')
# Printing html code of some tags
print(Parse.head)
print(Parse.h1)
print(Parse.h2)
print(Parse.h3)
print(Parse.li)
Output:
Traversing tags
The recursiveChildGenerator method is used to traverse the file: it recursively walks the document and yields every tag nested inside it.
Example:
Python3
# Importing BeautifulSoup class from the bs4 module
from bs4 import BeautifulSoup
# Opening the html file
HTMLFile = open("index.html", "r")
# Reading the file
index = HTMLFile.read()
# Creating a BeautifulSoup object and specifying the parser
S = BeautifulSoup(index, 'lxml')
# Using the recursiveChildGenerator method to traverse the html file
for TraverseTags in S.recursiveChildGenerator():
# Traversing the names of the tags
if TraverseTags.name:
# Printing the names of the tags
print(TraverseTags.name)
Output:
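Note: recursiveChildGenerator is an older Beautiful Soup method name kept for backwards compatibility; the descendants attribute, covered later in this article, iterates over the same elements and is the more common modern spelling.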
Parsing the name and text attributes of a tag
The name attribute of a tag is used to print its name and the text attribute to print its text, along with the HTML code of the ul tag from the file.
Example:
Python3
# Importing BeautifulSoup class from the bs4 module
from bs4 import BeautifulSoup
# Opening the html file
HTMLFile = open("index.html", "r")
# Reading the file
index = HTMLFile.read()
# Creating a BeautifulSoup object and specifying the parser
S = BeautifulSoup(index, 'lxml')
# Printing the Code, name, and text of a tag
print(f'HTML: {S.ul}, name: {S.ul.name}, text: {S.ul.text}')
Output:
Finding the children of a tag
The children attribute is used to get the children of a tag. Because children also yields the whitespace between tags, the condition e.name is not None is added so that only the names of the tags from the file are printed.
Example:
Python3
# Importing BeautifulSoup class from the bs4 module
from bs4 import BeautifulSoup
# Opening the html file
HTMLFile = open("index.html", "r")
# Reading the file
index = HTMLFile.read()
# Creating a BeautifulSoup object and specifying the parser
S = BeautifulSoup(index, 'lxml')
# Providing the source
Attr = S.html
# Using the Children attribute to get the children of a tag
# Only contain tag names and not the spaces
Attr_Tag = [e.name for e in Attr.children if e.name is not None]
# Printing the children
print(Attr_Tag)
Output:
Finding children at all levels of a tag:
The descendants attribute is used to get all the descendants of a tag (its children at every level) from the file.
Example:
Python3
# Importing BeautifulSoup class from the bs4 module
from bs4 import BeautifulSoup
# Opening the html file
HTMLFile = open("index.html", "r")
# Reading the file
index = HTMLFile.read()
# Creating a BeautifulSoup object and specifying the parser
S = BeautifulSoup(index, 'lxml')
# Providing the source
Des = S.body
# Using the descendants attribute
Attr_Tag = [e.name for e in Des.descendants if e.name is not None]
# Printing the children
print(Attr_Tag)
Output:
Finding all elements of a tag
Using find_all():
The find_all method is used to find all p tags in the file and print their name and text.
Example:
Python3
# Importing BeautifulSoup class from the bs4 module
from bs4 import BeautifulSoup
# Opening the html file
HTMLFile = open("index.html", "r")
# Reading the file
index = HTMLFile.read()
# Creating a BeautifulSoup object and specifying the parser
S = BeautifulSoup(index, 'lxml')
# Using the find_all method to find all elements of a tag
for tag in S.find_all('p'):
# Printing the name, and text of p tag
print(f'{tag.name}: {tag.text}')
Output:
CSS selectors for finding elements:
The select method is used with a CSS selector to find the second li element in the file.
Example:
Python3
# Importing BeautifulSoup class from the bs4 module
from bs4 import BeautifulSoup
# Opening the html file
HTMLFile = open("index.html", "r")
# Reading the file
index = HTMLFile.read()
# Creating a BeautifulSoup object and specifying the parser
S = BeautifulSoup(index, 'lxml')
# Using the select method
# Prints the second element from the li tag
print(S.select('li:nth-of-type(2)'))
Output:
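Unlike select_one, which returns a single tag (or None when nothing matches), select returns a list of every matching element, which is why the result above is printed inside list brackets. A minimal sketch of pulling the text out of the result, again assuming the hypothetical index.html from earlier:
Python3
# Importing BeautifulSoup class from the bs4 module
from bs4 import BeautifulSoup
# Opening and reading the html file
with open("index.html", "r") as f:
    S = BeautifulSoup(f.read(), 'lxml')
# select returns a list of matching tags (possibly empty)
Matches = S.select('li:nth-of-type(2)')
if Matches:
    print(Matches[0].text)
# select_one returns the first match directly, or None if there is no match
Second = S.select_one('li:nth-of-type(2)')
if Second is not None:
    print(Second.text)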