1. Environment setup
!pip install -U crawl4ai
!pip install nest_asyncio
!crawl4ai-setup
Verify that the installation succeeded
# Check crawl4ai version
import crawl4ai
print(crawl4ai.__version__.__version__)
Verify that the crawler can actually run
crawl4ai-doctor
2. Simple example: a Playwright smoke test
import asyncio
from playwright.async_api import async_playwright

async def test_browser():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto('https://example.com')
        print(f'Title: {await page.title()}')
        await browser.close()

asyncio.run(test_browser())
3. Simple example 2: a basic crawl with AsyncWebCrawler
import asyncio
import nest_asyncio
nest_asyncio.apply()

from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig

async def simple_crawl():
    crawler_run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.kidocode.com/degrees/technology",
            config=crawler_run_config
        )
        print(result.markdown.raw_markdown[:500].replace("\n", " -- "))  # Print the first 500 characters

asyncio.run(simple_crawl())
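BrowserConfig is imported above but not actually used in this example. A minimal sketch of passing browser-level options to AsyncWebCrawler, assuming BrowserConfig's headless and user_agent parameters; the user-agent string is only an illustration:

import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

async def crawl_with_browser_config():
    # Illustrative browser settings: headless Chromium with a custom user agent
    browser_config = BrowserConfig(
        headless=True,
        user_agent="Mozilla/5.0 (compatible; Crawl4AI-Demo)",
    )
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url="https://example.com", config=run_config)
        print(result.markdown.raw_markdown[:200])

asyncio.run(crawl_with_browser_config())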
4. Crawling dynamic content (simulate clicking the "Load More" button to load more items, then extract the Markdown text)
async def crawl_dynamic_content():
    # You can use wait_for to wait for a condition to be met before returning the result
    # wait_for = """() => {
    #     return Array.from(document.querySelectorAll('article.tease-card')).length > 10;
    # }"""
    # wait_for can also be just a CSS selector
    # wait_for = "article.tease-card:nth-child(10)"
    async with AsyncWebCrawler() as crawler:
        js_code = [
            "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
        ]
        config = CrawlerRunConfig(
            cache_mode=CacheMode.ENABLED,
            js_code=js_code,
            # wait_for=wait_for,
        )
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=config,
        )
        print(result.markdown_v2.raw_markdown[:500].replace("\n", " -- "))  # Print first 500 characters

asyncio.run(crawl_dynamic_content())
5. Content filtering and Markdown refinement
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

async def clean_content():
    async with AsyncWebCrawler(verbose=True) as crawler:
        config = CrawlerRunConfig(
            cache_mode=CacheMode.ENABLED,
            excluded_tags=['nav', 'footer', 'aside'],
            remove_overlay_elements=True,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0),
                options={"ignore_links": True}
            ),
        )
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Apple",
            config=config,
        )
        full_markdown_length = len(result.markdown_v2.raw_markdown)
        fit_markdown_length = len(result.markdown_v2.fit_markdown)
        print(f"Full Markdown Length: {full_markdown_length}")
        print(f"Fit Markdown Length: {fit_markdown_length}")

asyncio.run(clean_content())
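PruningContentFilter trims low-value blocks with a density heuristic. crawl4ai also ships a BM25-based filter for query-focused crawls; a minimal sketch, assuming the user_query/bm25_threshold parameters described in the crawl4ai docs (the query and threshold values are illustrative):

import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
from crawl4ai.content_filter_strategy import BM25ContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

async def query_focused_content():
    config = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,
        markdown_generator=DefaultMarkdownGenerator(
            # Keep only content relevant to the query; threshold value is illustrative
            content_filter=BM25ContentFilter(user_query="apple fruit nutrition", bm25_threshold=1.2)
        ),
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://en.wikipedia.org/wiki/Apple", config=config)
        print(f"Fit Markdown Length: {len(result.markdown_v2.fit_markdown)}")

asyncio.run(query_focused_content())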
6. Smart link extraction (filter out external and social-media links, then extract the page's internal links and print the first 5 with their href and text)
async def link_analysis():
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            cache_mode=CacheMode.ENABLED,
            exclude_external_links=True,
            exclude_social_media_links=True,
            # exclude_domains=["facebook.com", "twitter.com"]
        )
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=config,
        )
        print(f"Found {len(result.links['internal'])} internal links")
        print(f"Found {len(result.links['external'])} external links")
        for link in result.links['internal'][:5]:
            print(f"Href: {link['href']}\nText: {link['text']}\n")

asyncio.run(link_analysis())
7. Image extraction
async def media_handling():
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            cache_mode=CacheMode.ENABLED,
            exclude_external_images=False,
            # screenshot=True  # Set this to True if you want to take a screenshot
        )
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=config,
        )
        for img in result.media['images'][:5]:
            print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}")

asyncio.run(media_handling())
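The commented-out screenshot=True option hints at page capture. A minimal sketch of saving the capture to disk, assuming result.screenshot comes back as a base64-encoded image as described in the crawl4ai docs (the output filename is arbitrary):

import base64
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

async def capture_screenshot():
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://www.nbcnews.com/business", config=config)
        if result.screenshot:
            # result.screenshot is expected to be a base64-encoded image
            with open("nbcnews.png", "wb") as f:
                f.write(base64.b64decode(result.screenshot))
            print("Screenshot saved to nbcnews.png")

asyncio.run(capture_screenshot())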
8. Customizing browser behavior with hooks (useful for working around anti-crawler measures)
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, BrowserConfig
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from playwright.async_api import Page, BrowserContext  # ✅ import the type hints

# Define the before_goto hook
async def before_goto(page: Page, context: BrowserContext, url: str, **kwargs):
    print(f"[HOOK] before_goto - About to visit: {url}")
    await page.set_extra_http_headers({"Custom-Header": "my-value"})
    return page

# Main crawl logic
async def custom_hook_workflow(verbose=True):
    async with AsyncWebCrawler(config=BrowserConfig(verbose=verbose)) as crawler:
        crawler.crawler_strategy.set_hook("before_goto", before_goto)
        result = await crawler.arun(
            url="https://crawl4ai.com",
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        )
        print(result.markdown.raw_markdown[:500].replace("\n", " -- "))

asyncio.run(custom_hook_workflow())
Hook name            | Triggered when                                      | Example use
on_browser_created   | the browser instance is created                     | initialization logging, printing debug info
before_goto          | right before page navigation (not yet loaded)       | add request headers, custom headers/cookies
after_goto           | after the page has finished loading                 | log visits, debug navigation behavior
on_execution_started | before the page's JS starts executing               | log that JS interaction has started
before_return_html   | before the extracted HTML is returned (crawl done)  | post-process the DOM, validate the page
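As a sketch, two more of these hooks can be wired in alongside before_goto. The signatures below are assumptions based on the before_goto pattern above; the exact parameter lists may differ between crawl4ai versions:

import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, BrowserConfig
from playwright.async_api import Page, BrowserContext

async def after_goto(page: Page, context: BrowserContext, url: str, response=None, **kwargs):
    # Runs once navigation has finished; handy for logging and debugging redirects
    print(f"[HOOK] after_goto - Loaded: {url}")
    return page

async def before_return_html(page: Page, html: str, **kwargs):
    # Runs just before the extracted HTML is handed back; a place for final checks
    print(f"[HOOK] before_return_html - HTML length: {len(html)}")
    return page

async def multi_hook_workflow():
    async with AsyncWebCrawler(config=BrowserConfig(verbose=True)) as crawler:
        crawler.crawler_strategy.set_hook("after_goto", after_goto)
        crawler.crawler_strategy.set_hook("before_return_html", before_return_html)
        result = await crawler.arun(
            url="https://crawl4ai.com",
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        )
        print(result.markdown.raw_markdown[:200].replace("\n", " -- "))

asyncio.run(multi_hook_workflow())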
9. Multi-page crawling with a persistent session (paginated scraping)
from crawl4ai.extraction_strategy import (
    JsonCssExtractionStrategy,
    LLMExtractionStrategy,
)
import json

async def crawl_dynamic_content_pages_method_2():
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
    async with AsyncWebCrawler() as crawler:
        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []
        last_commit = ""

        js_next_page_and_wait = """
        (async () => {
            const getCurrentCommit = () => {
                const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
                return commits.length > 0 ? commits[0].textContent.trim() : null;
            };
            const initialCommit = getCurrentCommit();
            const button = document.querySelector('a[data-testid="pagination-next-button"]');
            if (button) button.click();
            // Poll for change
            while (true) {
                await new Promise(resolve => setTimeout(resolve, 100));  // Wait 100ms
                const newCommit = getCurrentCommit();
                if (newCommit && newCommit !== initialCommit) {
                    break;
                }
            }
        })();
        """

        schema = {
            "name": "Commit Extractor",
            "baseSelector": "li.Box-sc-g0xbh4-0",
            "fields": [
                {
                    "name": "title",
                    "selector": "h4.markdown-title",
                    "type": "text",
                    "transform": "strip",
                },
            ],
        }

        extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

        for page in range(2):  # Crawl 2 pages
            config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                extraction_strategy=extraction_strategy,
                js_code=js_next_page_and_wait if page > 0 else None,
                js_only=page > 0,
            )

            result = await crawler.arun(url=url, config=config)
            assert result.success, f"Failed to crawl page {page + 1}"

            commits = json.loads(result.extracted_content)
            all_commits.extend(commits)
            print(f"Page {page + 1}: Found {len(commits)} commits")

        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(all_commits)} commits across 2 pages")

asyncio.run(crawl_dynamic_content_pages_method_2())
10. Extracting structured data (without an LLM)
import json
import asyncio
import nest_asyncio
nest_asyncio.apply()

from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def extract():
    # Define the page-structure schema
    schema = {
        "name": "KidoCode Courses",
        "baseSelector": "section.charge-methodology .div-block-214.p-extraxx",
        "fields": [
            {
                "name": "section_title",
                "selector": "h3.heading-50",
                "type": "text",
            },
            {
                "name": "section_description",
                "selector": ".charge-content",
                "type": "text",
            },
            {
                "name": "course_name",
                "selector": ".text-block-93",
                "type": "text",
            },
            {
                "name": "course_description",
                "selector": ".course-content-text",
                "type": "text",
            },
            {
                "name": "course_icon",
                "selector": ".image-92",
                "type": "attribute",
                "attribute": "src"
            }
        ]
    }

    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

    # JS snippet that clicks through every tab
    js_click_tabs = """
    (async () => {
        const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
        for(let tab of tabs) {
            tab.scrollIntoView();
            tab.click();
            await new Promise(r => setTimeout(r, 500));
        }
    })();
    """

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            extraction_strategy=extraction_strategy,
            js_code=[js_click_tabs],
        )
        result = await crawler.arun(
            url="https://www.kidocode.com/degrees/technology",
            config=config
        )

        # Work with the structured content
        courses = json.loads(result.extracted_content)
        print(result.extracted_content)
        print(f"✅ Successfully extracted {len(courses)} courses")
        print(f"📄 Markdown Length: {len(result.markdown.raw_markdown)}")

# Run the async entry point
asyncio.run(extract())
11. Extracting structured information with a large language model
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
from openai import OpenAI
import json
from typing import List, Dict

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=""
)

async def fetch_markdown():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://openai.com/api/pricing/",
            config=CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
        )
        markdown = result.markdown.raw_markdown
        return markdown

markdown_text = asyncio.run(fetch_markdown())

def llm_extract_markdown(markdown_text: str) -> List[Dict]:
    """Use a Gemini model (via OpenRouter) to extract structured model-pricing info from the markdown"""
    prompt = f"""
The following is the content of the API pricing page from the OpenAI website (markdown format):
--------------------
{markdown_text}
--------------------
Please extract the pricing information for every model as a JSON list, where each item contains:
- model_name
- input_fee
- output_fee

The expected format is:
[
    {{
        "model_name": "GPT-4o",
        "input_fee": "$2.50 / 1M tokens",
        "output_fee": "$10.00 / 1M tokens"
    }},
    ...
]
If a model is missing an input/output fee, use an empty string.
Return only the JSON content, with no explanation.
"""

    # ✅ Send the request to Gemini (via OpenRouter)
    completion = client.chat.completions.create(
        model="google/gemini-2.5-pro-exp-03-25:free",
        extra_headers={
            "HTTP-Referer": "https://yourdomain.com",  # Optional, counts toward OpenRouter rankings
            "X-Title": "LLM Extractor Example"         # Optional, used for OpenRouter stats
        },
        messages=[
            {"role": "user", "content": prompt}
        ]
    )

    content = completion.choices[0].message.content.strip()
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        print("❌ The Gemini response is not valid JSON:\n")
        print(content)
        return []

result = llm_extract_markdown(markdown_text)
print("\n✅ Extraction results:")
for item in result:
    print(item)
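crawl4ai also ships a built-in LLMExtractionStrategy (imported back in section 9) that folds this crawl-then-extract loop into a single arun call. A minimal sketch, assuming the provider/api_token constructor style of the crawl4ai release used above (newer releases pass an llm_config object instead); the provider string, model-fee schema, and instruction text are illustrative:

import os
import json
import asyncio
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy

class ModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the model")
    input_fee: str = Field(..., description="Fee for input tokens")
    output_fee: str = Field(..., description="Fee for output tokens")

async def extract_with_llm_strategy():
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=LLMExtractionStrategy(
            provider="openai/gpt-4o-mini",           # illustrative provider string
            api_token=os.getenv("OPENAI_API_KEY"),
            schema=ModelFee.model_json_schema(),
            extraction_type="schema",
            instruction="Extract every model name with its input and output token fees.",
        ),
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://openai.com/api/pricing/", config=config)
        fees = json.loads(result.extracted_content)
        print(fees[:3])  # Show the first few extracted entries

asyncio.run(extract_with_llm_strategy())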