# Simple web crawler (简单爬虫)
import requests
import re
import chardet# 模拟浏览器的请求头
# Request headers that mimic a desktop browser's User-Agent.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

# Page whose <title> is fetched and printed.
url = "https://www.163.com"

# Compiled once at import. DOTALL lets a title span several lines and
# `\b[^>]*` tolerates attributes on the opening tag (e.g. <title id="t">).
_TITLE_PATTERN = re.compile(
    r"<title\b[^>]*>(.*?)</title>", re.IGNORECASE | re.DOTALL
)


def extract_title(html_content):
    """Return the text of the first ``<title>`` element in *html_content*.

    Args:
        html_content: The HTML document as a string.

    Returns:
        The title text with surrounding whitespace stripped, or ``None``
        when the document contains no ``<title>...</title>`` pair.
    """
    title_match = _TITLE_PATTERN.search(html_content)
    if title_match is None:
        return None
    return title_match.group(1).strip()


def main():
    """Fetch ``url`` and print the page title, or a failure message."""
    try:
        # A timeout keeps the script from hanging forever on a stalled server.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"请求失败: {exc}")
        return

    # Only bother decoding and parsing the body of a successful response.
    if response.status_code != 200:
        print(f"请求失败,状态码: {response.status_code}")
        return

    # Detect the encoding from the raw bytes; keep requests' own choice when
    # chardet cannot make a guess (it reports None in that case).
    detected = chardet.detect(response.content)["encoding"]
    if detected:
        response.encoding = detected

    title = extract_title(response.text)
    if title is not None:
        print(f"网页标题: {title}")
    else:
        print("未找到标题")


if __name__ == "__main__":
    main()