This article explains in detail how to scrape web page data with Python, with commented code examples. We will proceed step by step, covering everything from basic page scraping to handling dynamically loaded content.
Let's get started.
First, install the required Python libraries (the Selenium example later on also relies on webdriver-manager):
pip install requests beautifulsoup4 selenium webdriver-manager
We start with the most basic kind of page scraping.
import requests
from bs4 import BeautifulSoup

# Target URL
url = "https://example.com"

# Request headers that mimic a real browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Send the GET request
response = requests.get(url, headers=headers)

# Check whether the request succeeded
if response.status_code == 200:
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the page title
    title = soup.title.string if soup.title else "No title found"
    print(f"Page title: {title}")

    # Extract the text of every paragraph
    paragraphs = soup.find_all('p')
    for i, paragraph in enumerate(paragraphs, 1):
        print(f"Paragraph {i}: {paragraph.text.strip()}")

    # Extract all links
    links = soup.find_all('a')
    for i, link in enumerate(links, 1):
        print(f"Link {i}: {link.get('href')}")
else:
    print(f"Request failed with status code: {response.status_code}")
Many sites spread their content across multiple pages. The following example shows how to handle such a paginated list.
import requests
from bs4 import BeautifulSoup

def scrape_page(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')
    return None

def extract_data(soup):
    # Adjust these selectors to match the actual page structure
    items = soup.select('.item')  # assumes each list entry carries an 'item' class
    data = []
    for item in items:
        title = item.select_one('.title').text.strip()
        link = item.select_one('a')['href']
        data.append({'title': title, 'link': link})
    return data

# Main entry point
def main():
    base_url = "https://example.com/list?page={}"
    all_data = []

    for page in range(1, 6):  # scrape the first 5 pages
        url = base_url.format(page)
        print(f"Scraping page {page}...")
        soup = scrape_page(url)
        if soup:
            page_data = extract_data(soup)
            all_data.extend(page_data)
        else:
            print(f"Failed to scrape page {page}")

    print(f"Scraped {len(all_data)} items in total")
    # Data-saving logic could go here, e.g. writing to a CSV file (see the sketch below)

if __name__ == "__main__":
    main()
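The main() function above leaves the data-saving step as a placeholder. A minimal sketch of that step is shown below; the save_to_csv helper, the fixed "title"/"link" field names, and the results.csv filename are illustrative assumptions rather than part of the original example.
import csv

def save_to_csv(data, filename="results.csv"):
    # data is expected to be a list of dicts like {'title': ..., 'link': ...}
    if not data:
        print("No data to save")
        return
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["title", "link"])
        writer.writeheader()
        writer.writerows(data)
    print(f"Saved {len(data)} rows to {filename}")

# Inside main(), after the scraping loop:
# save_to_csv(all_data)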
Next is an example of extracting and downloading the images on a page.
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin

def download_image(url, folder):
    # Make sure the target folder exists
    if not os.path.exists(folder):
        os.makedirs(folder)

    # Derive a file name from the URL
    filename = url.split("/")[-1]
    filepath = os.path.join(folder, filename)

    # Download the image
    response = requests.get(url)
    if response.status_code == 200:
        with open(filepath, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {filename}")
    else:
        print(f"Download failed: {url}")

def scrape_images(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        images = soup.find_all('img')
        for img in images:
            src = img.get('src')
            if src:
                # urljoin resolves both absolute and relative image URLs against the page URL
                full_url = urljoin(url, src)
                download_image(full_url, 'downloaded_images')

# Usage example
scrape_images("https://example.com")
Many modern sites load their content dynamically via AJAX. The following example shows how to call such AJAX endpoints directly.
import requests
import json

def scrape_ajax_data(url, params):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"  # some sites check for this header
    }
    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:
        try:
            data = response.json()
            return data
        except json.JSONDecodeError:
            print("The response is not valid JSON")
            return None
    else:
        print(f"Request failed with status code: {response.status_code}")
        return None

# Usage example
ajax_url = "https://example.com/api/data"
params = {
    "page": 1,
    "limit": 10
}

data = scrape_ajax_data(ajax_url, params)
if data:
    # Process the returned data
    for item in data.get('items', []):
        print(f"Title: {item.get('title')}")
        print(f"Link: {item.get('url')}")
        print("---")
For sites that render their content with JavaScript, we need Selenium to drive a real browser.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Set up the Chrome driver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# Open the page
url = "https://example.com"
driver.get(url)

try:
    # Wait until the target element has loaded
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "dynamic-content"))
    )

    # The dynamically loaded content can now be extracted
    dynamic_content = driver.find_elements(By.CLASS_NAME, "dynamic-content")
    for item in dynamic_content:
        print(item.text)

    # If a button has to be clicked to load more content
    load_more_button = driver.find_element(By.ID, "load-more")
    load_more_button.click()

    # Wait for the new content to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "new-content"))
    )

    # Extract the newly loaded content
    new_content = driver.find_elements(By.CLASS_NAME, "new-content")
    for item in new_content:
        print(item.text)
finally:
    driver.quit()  # close the browser
These examples cover several aspects of web scraping: basic static page scraping, handling paginated lists, downloading images, working with AJAX requests, and using Selenium for dynamic content. Each example includes comments explaining what the code does and why.
In practice you will need to adapt this code to the structure of the specific site and to your own requirements. Also make sure your scraping complies with the target site's terms of service and with applicable law; if the site offers an API, using the API is usually preferable to scraping.
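As a practical complement to that advice, one common courtesy is to consult the site's robots.txt and to pace your requests. Below is a minimal sketch using Python's standard urllib.robotparser; the polite_get helper, the user-agent string, and the one-second delay are illustrative assumptions, and in real use you would cache the parsed robots.txt instead of re-reading it on every request.
import time
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

import requests

USER_AGENT = "MyScraper/1.0"  # illustrative; identify your scraper honestly

def polite_get(url, delay=1.0):
    # Check the site's robots.txt before fetching
    parser = RobotFileParser()
    parser.set_url(urljoin(url, "/robots.txt"))
    parser.read()  # re-fetched here for simplicity; cache this in real code
    if not parser.can_fetch(USER_AGENT, url):
        print(f"robots.txt disallows fetching: {url}")
        return None

    # Pause between requests to avoid overloading the server
    time.sleep(delay)
    return requests.get(url, headers={"User-Agent": USER_AGENT})

# Usage example
response = polite_get("https://example.com/list?page=1")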