📅  最后修改于: 2023-12-03 15:19:02.939000             🧑  作者: Mango
这是一篇介绍使用 Python 爬虫获取网页数据并统计最常用单词的教程,使用的工具是 Python3、第三方库 requests 和 BeautifulSoup,以及标准库 collections 中的 Counter。
import requests
from bs4 import BeautifulSoup
from collections import Counter
def get_text(url, timeout=10):
    """Download a web page and return its text content.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` may block indefinitely.

    Returns:
        The text extracted from the parsed HTML by
        ``BeautifulSoup.get_text()``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of counting the words of an
    # error page.
    res.raise_for_status()
    # When the Content-Type header declares no charset, requests falls back
    # to ISO-8859-1 for text responses, which garbles UTF-8/GBK pages. In
    # that case use the encoding detected from the body itself.
    content_type = res.headers.get('Content-Type', '')
    if 'charset' not in content_type.lower():
        res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'html.parser')
    return soup.get_text()
def clean_text(text):
    """Split text into lowercase words and strip punctuation characters.

    The text is lowercased and split on whitespace; every character listed
    in ``symbols`` is then removed from each token. Tokens that become
    empty after stripping (i.e. consisted only of symbols) are dropped.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list of cleaned, non-empty word strings in their original order.
    """
    # The backslash is written as `\\` so the literal has no invalid escape
    # sequence; the resulting characters are the same as before.
    symbols = '!@#$%^&*()_-+={[}]|\\;:"<>/.,?~`'
    # One translation table, built once per call, deletes all symbols in a
    # single pass per word instead of one .replace() call per symbol.
    strip_table = str.maketrans('', '', symbols)
    stripped = (word.translate(strip_table) for word in text.lower().split())
    return [word for word in stripped if word]
def count_words(cleaned_words, num):
    """Tally word occurrences and return the ``num`` most frequent ones.

    Args:
        cleaned_words: Iterable of word strings to count.
        num: How many of the top entries to return.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, as produced by ``Counter.most_common``.
    """
    frequencies = Counter()
    frequencies.update(cleaned_words)
    return frequencies.most_common(num)
if __name__ == '__main__':
    # Script entry point: fetch the page, tokenize its text, and print the
    # ten most frequent words as (word, count) pairs.
    result = count_words(clean_text(get_text('https://www.baidu.com')), 10)
    print(result)
代码说明: