BeautifulSoup is a Python library for parsing HTML and XML documents. It helps us extract the data we need from web pages, which makes it particularly well suited to web scraping and data mining tasks.
Install BeautifulSoup and the related libraries with pip:
pip install beautifulsoup4
pip install lxml
pip install requests

beautifulsoup4: the main library
lxml: a parser, relatively fast
requests: used to fetch the page content
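Before touching the network, a quick check that the installation works: the sketch below parses a small inline HTML string (the markup is made up purely for illustration).

from bs4 import BeautifulSoup

# Made-up HTML snippet used only to verify that beautifulsoup4 and lxml are installed
html = "<html><body><h1>Hello</h1><a href='https://example.com'>a link</a></body></html>"

soup = BeautifulSoup(html, 'lxml')
print(soup.h1.get_text())   # Hello
print(soup.a.get('href'))   # https://example.com

Fetching and parsing a real page works the same way, with requests supplying the HTML: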
from bs4 import BeautifulSoup
import requests
# Fetch the page content
url = 'https://www.example.com'
response = requests.get(url)

# Check whether the request succeeded
if response.status_code == 200:
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(response.text, 'lxml')
    print("Page parsed successfully")
else:
    print("Page request failed")

Chinese-language pages can come back garbled, so the encoding needs to be set correctly:
from bs4 import BeautifulSoup
import requests
url = 'https://www.baidu.com'
response = requests.get(url)
# Set the encoding to utf-8
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'lxml')

# Get the page title
title = soup.find('title')
if title:
    print("Page title:", title.get_text())
find() returns the first element that matches the given criteria:

from bs4 import BeautifulSoup
import requests
url = 'https://www.baidu.com'
response = requests.get(url)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'lxml')
# Find the first div element
first_div = soup.find('div')
print("First div:", first_div)

# Find an element by id
search_box = soup.find(id='kw')
print("Search box:", search_box)

# Find an element by class
logo = soup.find(class_='index-logo')
print("Logo element:", logo)

find_all() returns all matching elements:

from bs4 import BeautifulSoup
import requests
url = 'https://www.baidu.com'
response = requests.get(url)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'lxml')
# Find all links
all_links = soup.find_all('a')
print(f"Found {len(all_links)} links")

# Show the first 5 links
for i, link in enumerate(all_links[:5]):
    href = link.get('href')
    text = link.get_text().strip()
    print(f"Link {i+1}: {text} -> {href}")

# Find all images
images = soup.find_all('img')
print(f"Found {len(images)} images")

get_text() returns the text content with the HTML tags stripped:

from bs4 import BeautifulSoup
import requests
url = 'https://www.baidu.com'
response = requests.get(url)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'lxml')
# Get the text content of an element
first_link = soup.find('a')
if first_link:
    link_text = first_link.get_text()
    print("Link text:", link_text)

# Get all of the page text (HTML tags removed)
all_text = soup.get_text()
print("Page text (first 200 characters):", all_text[:200])

Element attributes can be read with get() or through the attrs dictionary:

from bs4 import BeautifulSoup
import requests
url = 'https://www.baidu.com'
response = requests.get(url)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'lxml')
# Find the search button
search_button = soup.find('input', id='su')
if search_button:
    # Get the value attribute
    button_value = search_button.get('value')
    print("Button text:", button_value)

    # Get all attributes
    all_attributes = search_button.attrs
    print("All attributes:", all_attributes)

# Get the href attribute of a link
first_link = soup.find('a')
if first_link:
    href = first_link.get('href')
    print("Link URL:", href)

select() accepts CSS selectors:

from bs4 import BeautifulSoup
import requests
url = 'https://www.baidu.com'
response = requests.get(url)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'lxml')
# Find elements with CSS selectors
# All links with class "mnav"
nav_links = soup.select('a.mnav')
for link in nav_links:
    print("Navigation link:", link.get_text())

# Find an element by id
search_box = soup.select('#kw')
if search_box:
    print("Found the search box")

# Find nested elements
nested_elements = soup.select('div.head div.nav')

find_all() can also filter elements by their attributes:

from bs4 import BeautifulSoup
import requests
url = 'https://www.baidu.com'
response = requests.get(url)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'lxml')
# Find elements that have a particular attribute
images_with_src = soup.find_all('img', src=True)
print(f"Found {len(images_with_src)} images with a src attribute")

# Find elements with a specific attribute
links_with_href = soup.find_all('a', href=True)
for link in links_with_href[:3]:
    print(f"Link: {link.get('href')}")

# Use a dictionary to match several attributes at once
specific_elements = soup.find_all('input', {'type': 'text', 'class': 's_ipt'})

The document tree can be navigated through parent, child, and sibling elements:

from bs4 import BeautifulSoup
import requests
url = 'https://www.baidu.com'
response = requests.get(url)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'lxml')
# Find the first link
first_link = soup.find('a')
if first_link:
    # Get the parent element
    parent = first_link.parent
    print("Parent tag:", parent.name)

    # Get all child elements
    print("Children:")
    for child in parent.children:
        if child.name:  # only show tag elements, skip text nodes
            print(f"  - {child.name}")

# Get sibling elements
if first_link:
    # The next sibling element
    next_sibling = first_link.find_next_sibling()
    if next_sibling:
        print("Next sibling:", next_sibling.name)

A practical example: extracting headlines from a news page.

from bs4 import BeautifulSoup
import requests
def get_news_titles(url):
    """Extract headlines from a news site."""
    try:
        response = requests.get(url)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'lxml')

        # Assume the headlines live in h2 tags
        titles = soup.find_all('h2')
        news_list = []
        for title in titles:
            text = title.get_text().strip()
            if text:  # skip empty headlines
                news_list.append(text)
        return news_list
    except Exception as e:
        print(f"Failed to fetch news: {e}")
        return []

# Example usage
# news_url = "https://news.example.com"
# titles = get_news_titles(news_url)
# for i, title in enumerate(titles[:5], 1):
#     print(f"{i}. {title}")

Another practical example: pulling product information from an e-commerce page.

from bs4 import BeautifulSoup
import requests
def extract_product_info(url):
    """Extract product information from an e-commerce page."""
    try:
        response = requests.get(url)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'lxml')

        products = []
        # Assume each product sits in a div with a specific class
        product_elements = soup.find_all('div', class_='product')
        for product in product_elements:
            # Product name
            name_elem = product.find('h3')
            name = name_elem.get_text().strip() if name_elem else "unknown"

            # Price
            price_elem = product.find('span', class_='price')
            price = price_elem.get_text().strip() if price_elem else "unknown"

            # Image URL
            img_elem = product.find('img')
            image_url = img_elem.get('src') if img_elem else "no image"

            products.append({
                'name': name,
                'price': price,
                'image_url': image_url
            })
        return products
    except Exception as e:
        print(f"Failed to extract product info: {e}")
        return []

# Example usage
# products = extract_product_info("https://shop.example.com")
# for product in products:
#     print(f"Product: {product['name']}, price: {product['price']}")

Tables can be extracted row by row:

from bs4 import BeautifulSoup
import requests
import pandas as pd
def extract_table_data(url, table_class=None):
    """Extract table data from a web page."""
    try:
        response = requests.get(url)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'lxml')

        # Find the tables
        if table_class:
            tables = soup.find_all('table', class_=table_class)
        else:
            tables = soup.find_all('table')

        headers = []
        all_data = []
        for table in tables:
            # Extract the header row
            header_row = table.find('tr')
            if header_row:
                header_cells = header_row.find_all(['th', 'td'])
                headers = [cell.get_text().strip() for cell in header_cells]

            # Extract the data rows
            rows = table.find_all('tr')[1:]  # skip the header row
            for row in rows:
                cells = row.find_all(['td', 'th'])
                row_data = [cell.get_text().strip() for cell in cells]
                if row_data:
                    all_data.append(row_data)
        return headers, all_data
    except Exception as e:
        print(f"Failed to extract table data: {e}")
        return [], []

# Example usage
# headers, data = extract_table_data("https://data.example.com")
# print("Headers:", headers)
# for row in data[:3]:
#     print("Row:", row)
For more robust scraping, set request headers and a timeout, and handle errors explicitly:

from bs4 import BeautifulSoup
import requests

def safe_extract(soup, selector, default="not found"):
    """Safely extract the text content of an element."""
    try:
        element = soup.select_one(selector)
        if element:
            return element.get_text().strip()
        return default
    except Exception as e:
        print(f"Extraction failed: {e}")
        return default

def robust_web_scraping(url):
    """A more robust scraping function."""
    try:
        # Set request headers to mimic a browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # raise an error if the request failed
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'lxml')

        # Safely extract various pieces of information
        title = safe_extract(soup, 'title')
        first_heading = safe_extract(soup, 'h1')

        print(f"Title: {title}")
        print(f"Main heading: {first_heading}")
        return soup
    except requests.exceptions.RequestException as e:
        print(f"Network request error: {e}")
        return None
    except Exception as e:
        print(f"Other error: {e}")
        return None

# Example usage
# soup = robust_web_scraping("https://www.example.com")

Extracted data can be saved to JSON or CSV files:

import json
import csv
from bs4 import BeautifulSoup
import requests
def save_extracted_data(url, output_format='json'):
    """Extract data and save it to a file."""
    try:
        response = requests.get(url)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'lxml')

        # Extract data (links, in this example)
        links = soup.find_all('a', href=True)
        link_data = []
        for link in links:
            text = link.get_text().strip()
            href = link.get('href')
            if text and href:
                link_data.append({
                    'text': text,
                    'url': href
                })

        # Save in the requested format
        if output_format == 'json':
            with open('links.json', 'w', encoding='utf-8') as f:
                json.dump(link_data, f, ensure_ascii=False, indent=2)
            print("Data saved to links.json")
        elif output_format == 'csv':
            with open('links.csv', 'w', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(['Link text', 'Link URL'])
                for item in link_data:
                    writer.writerow([item['text'], item['url']])
            print("Data saved to links.csv")
        return link_data
    except Exception as e:
        print(f"Failed to save data: {e}")
        return []

# Example usage
# data = save_extracted_data("https://www.example.com", 'json')

Multiple URLs can be scraped in a batch, with a delay between requests:

from bs4 import BeautifulSoup
import requests
import time
def batch_scrape_urls(url_list, delay=1):
    """Scrape a list of URLs one by one."""
    all_data = []
    for url in url_list:
        try:
            print(f"Processing: {url}")
            response = requests.get(url)
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'lxml')

            # Extract the data we need
            title = soup.find('title')
            title_text = title.get_text() if title else "no title"

            all_data.append({
                'url': url,
                'title': title_text
            })

            # Wait between requests to avoid hammering the site
            time.sleep(delay)
        except Exception as e:
            print(f"Error while processing {url}: {e}")
            continue
    return all_data

# Example usage
# urls = [
#     "https://www.example.com/page1",
#     "https://www.example.com/page2",
#     "https://www.example.com/page3"
# ]
# results = batch_scrape_urls(urls)
# for result in results:
#     print(f"URL: {result['url']}, title: {result['title']}")

A few best practices for web scraping:

Respect robots.txt: check the site's robots.txt file and follow its crawling rules (see the sketch after this list)
Set delays: add a pause between requests so you do not overload the site
Handle exceptions: network requests can fail, so add appropriate error handling
Respect copyright: only scrape content you are allowed to scrape, and respect data ownership
Use proxies: if you need to scrape at scale, consider using proxy IPs
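A minimal sketch of the robots.txt check and a polite request, using urllib.robotparser from the standard library; the page URL and proxy address below are placeholders, not real values:

import time
import requests
from urllib.robotparser import RobotFileParser

# Check robots.txt before crawling
rp = RobotFileParser()
rp.set_url('https://www.example.com/robots.txt')
rp.read()

url = 'https://www.example.com/some-page'  # hypothetical page
if rp.can_fetch('*', url):
    # proxies is optional; the address here is a placeholder
    proxies = {'http': 'http://127.0.0.1:8080', 'https': 'http://127.0.0.1:8080'}
    response = requests.get(url, proxies=proxies, timeout=10)
    print(response.status_code)
    time.sleep(1)  # pause before the next request
else:
    print("robots.txt disallows fetching this URL")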
BeautifulSoup is a powerful tool for extracting data from web pages. Its main capabilities include:

Parsing HTML and XML documents
Finding and extracting specific elements
Reading element attributes and text content
Navigating the document structure
When using BeautifulSoup, keep in mind:

Choose a suitable parser (lxml is faster; a fallback sketch follows this list)
Handle encoding correctly
Wrap risky calls in try-except to deal with possible exceptions
Follow the website's terms of use
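On the parser point, a minimal fallback sketch, assuming an HTML string named html is available; bs4 raises FeatureNotFound when the requested parser is not installed:

from bs4 import BeautifulSoup, FeatureNotFound

html = "<p>example</p>"  # placeholder markup

try:
    soup = BeautifulSoup(html, 'lxml')         # fast, requires the lxml package
except FeatureNotFound:
    soup = BeautifulSoup(html, 'html.parser')  # built-in parser, no extra install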
Remember: web scraping must stay legal and compliant, and respect the website's resources and rules.