"""Yuque document crawler.

Saves an entire Yuque knowledge base of any user as Markdown files,
including the full directory tree and a SUMMARY.md index.

Usage: python3 main.py <yuque-book-url>
"""
import sys
import os
import re
import json
import uuid
import time
import urllib.parse
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import requests
from requests import Response, exceptions as req_exc

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0.0.0 Safari/537.36"
    )
}
TIMEOUT = 10              # per-request timeout (seconds)
RETRY = 3                 # max retry count for image downloads
SLEEP_BETWEEN_RETRY = 1   # delay between retries (seconds)


def _retry_get(url: str, max_retry: int = RETRY, **kwargs) -> Tuple[bool, Optional[Response]]:
    """GET *url* with retries.

    Returns ``(True, response)`` on HTTP 200, or ``(False, None)`` after
    *max_retry* failed attempts.  (Return type fixed: the failure path
    yields ``None``, so the annotation must be ``Optional[Response]``.)
    """
    for attempt in range(1, max_retry + 1):
        try:
            # stream=True so large images can be saved chunk by chunk.
            resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT, stream=True)
            if resp.status_code == 200:
                return True, resp
            print(f"[WARN] 下载失败(HTTP{resp.status_code}),第{attempt}/{max_retry}次重试:{url}")
        except req_exc.RequestException as e:
            print(f"[WARN] 下载异常{e},第{attempt}/{max_retry}次重试:{url}")
        time.sleep(SLEEP_BETWEEN_RETRY)
    return False, None


def _ensure_dir(path: Path) -> None:
    """Create *path* (and parents) if it does not already exist."""
    if not path.exists():
        path.mkdir(parents=True, exist_ok=True)


def _save_binary(resp: Response, dest: Path) -> None:
    """Stream the response body to *dest* in 8 KiB chunks."""
    with dest.open("wb") as f:
        for chunk in resp.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)


def _extract_images(md: str) -> List[Tuple[str, str]]:
    """Extract image references from Markdown text.

    Matches both Markdown syntax ``![alt](http…)`` and inline HTML
    ``<img src="http…">``.  Returns a list of ``(alt, url)`` pairs
    (alt is empty for HTML matches).
    """
    pattern_md = re.compile(r'!\[([^\]]*)\]\((https?[^)]+)\)', re.IGNORECASE)
    pattern_html = re.compile(r'<img[^>]*?src=["\'](https?[^"\']+)["\']', re.IGNORECASE)
    images = pattern_md.findall(md)
    images += [('', m) for m in pattern_html.findall(md)]
    return images


def _local_filename(url: str) -> str:
    """Derive a unique local filename for an image URL.

    Keeps the URL's suffix when it looks sane; falls back to ``.png``
    for missing or implausibly long suffixes.
    """
    suffix = Path(urllib.parse.urlparse(url).path).suffix.lower()
    if not suffix or len(suffix) > 6:
        suffix = ".png"
    return f"{uuid.uuid4().hex}{suffix}"


def save_page(book_id: str, slug: str, md_path: str) -> None:
    """Download one document as Markdown, localizing its images.

    Resume support: if *md_path* already exists the document is skipped.
    Images are downloaded into an ``images/`` directory next to the
    Markdown file and the remote URLs are rewritten to relative paths;
    on download failure the remote URL is kept as a fallback.
    """
    # Resume: skip documents that were already saved.
    if os.path.exists(md_path):
        print(f"[INFO] 文件已存在,跳过:{md_path}")
        return
    url = (
        f"https://www.yuque.com/api/docs/{slug}"
        f"?book_id={book_id}&merge_dynamic_data=false&mode=markdown"
    )
    try:
        resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
        if resp.status_code != 200:
            print(f"[ERROR] 文档下载失败(状态码{resp.status_code}),可能已删除:{book_id}{slug}")
            return
        doc_json = resp.json()
        md_content = doc_json["data"]["sourcecode"]
    except (req_exc.RequestException, KeyError, json.JSONDecodeError) as e:
        print(f"[ERROR] 文档解析失败:{e}|{book_id}{slug}")
        return

    # Download referenced images and build a remote-URL -> local-path map.
    images_dir = Path(md_path).parent / "images"
    images_map: Dict[str, str] = {}
    for alt, img_url in _extract_images(md_content):
        if img_url in images_map:
            continue
        local_name = _local_filename(img_url)
        local_rel_path = f"images/{local_name}"
        local_abs_path = images_dir / local_name
        # Create the directory only once we actually have images to save.
        _ensure_dir(images_dir)
        # Avoid re-downloading an image that is already on disk.
        if local_abs_path.exists():
            images_map[img_url] = local_rel_path
            continue
        success, img_resp = _retry_get(img_url, RETRY)
        if success:
            try:
                _save_binary(img_resp, local_abs_path)
                images_map[img_url] = local_rel_path
            except Exception as e:
                print(f"[WARN] 图片保存失败{e},回退远程链接:{img_url}")
                images_map[img_url] = img_url
        else:
            # Download failed: keep the remote URL so the doc stays usable.
            images_map[img_url] = img_url
    for remote, local in images_map.items():
        md_content = md_content.replace(remote, local)

    # Write the Markdown file.
    try:
        Path(md_path).parent.mkdir(parents=True, exist_ok=True)
        with open(md_path, "w", encoding="utf-8") as f_md:
            f_md.write(md_content)
        print(f"[INFO] 文档已保存:{md_path}")
    except OSError as e:
        print(f"[ERROR] 保存文件失败(可能是文件名非法):{md_path}|{e}")


def get_book(book_url: str = "") -> None:
    """Crawl an entire Yuque knowledge base given its URL.

    Extracts the embedded TOC JSON from the book page, mirrors the
    directory hierarchy under ``download/<book_id>/``, downloads every
    document via :func:`save_page`, and writes a ``SUMMARY.md`` index.
    Titles are sanitized for the filesystem (Windows-illegal characters
    replaced, surrounding whitespace stripped).
    """
    try:
        resp = requests.get(book_url, headers=HEADERS, timeout=TIMEOUT)
        resp.raise_for_status()
    except req_exc.RequestException as e:
        print(f"[ERROR] 获取知识库失败:{e}")
        return
    matches = re.findall(r'decodeURIComponent\(\"(.+)\"\)\);', resp.text)
    if not matches:
        print("[ERROR] 未找到知识库数据")
        return
    try:
        docs_json = json.loads(urllib.parse.unquote(matches[0]))
    except json.JSONDecodeError as e:
        print(f"[ERROR] 解析知识库 JSON 失败:{e}")
        return

    book_id = str(docs_json["book"]["id"])
    book_root = Path("download") / book_id
    _ensure_dir(book_root)
    toc = docs_json["book"]["toc"]
    uuid_title_parent: Dict[str, Tuple[str, str]] = {
        d["uuid"]: (d["title"], d["parent_uuid"]) for d in toc
    }
    resolved_paths: Dict[str, str] = {}

    # Map every filesystem-illegal character (incl. '?' on Windows) plus
    # newlines to underscores.  '\\' is escaped explicitly — the original
    # '\/' was an invalid escape sequence (SyntaxWarning on CPython >= 3.12).
    trans_table = str.maketrans('\\/:*?"<>|' + "\n\r", "___________")

    def resolve_path(u: str) -> str:
        """Resolve a TOC uuid to its sanitized 'parent/child' path (memoized)."""
        if u in resolved_paths:
            return resolved_paths[u]
        # Defensive: a node may reference a parent missing from the TOC.
        if u not in uuid_title_parent:
            return "Unknown"
        title, parent = uuid_title_parent[u]
        # strip() removes leading/trailing whitespace left after sanitizing.
        safe_title = title.translate(trans_table).strip()
        if not safe_title:
            safe_title = "Untitled"
        if not parent:
            path_ = safe_title
        else:
            path_ = f"{resolve_path(parent)}/{safe_title}"
        resolved_paths[u] = path_
        return path_

    summary_lines: List[str] = []
    for item in toc:
        path_rel = resolve_path(item["uuid"])
        abs_dir = book_root / path_rel
        is_dir = item["type"] == "TITLE" or item.get("child_uuid") != ""
        if is_dir:
            # Path may be too deep or otherwise invalid — best effort only.
            try:
                _ensure_dir(abs_dir)
            except OSError:
                pass
            header_level = path_rel.count("/") + 2
            # A space after the hashes is required for a valid ATX heading
            # (the original concatenated them directly: "##Title").
            summary_lines.append("#" * header_level + " " + path_rel.split("/")[-1])
        if item.get("url"):
            md_filename = f"{path_rel}.md"
            summary_indent = " " * path_rel.count("/")
            summary_lines.append(
                f"{summary_indent}* [{item['title']}]({urllib.parse.quote(md_filename)})"
            )
            save_page(book_id, item["url"], str(book_root / md_filename))

    with open(book_root / "SUMMARY.md", "w", encoding="utf-8") as f_sum:
        f_sum.write("\n".join(summary_lines))
    print(f"[INFO] SUMMARY.md 已生成:{book_root/'SUMMARY.md'}")


# Usage: python3 main.py <yuque-doc-url>
if __name__ == "__main__":
    if len(sys.argv) > 1:
        get_book(sys.argv[1])
    else:
        get_book("")
# 项目地址 (project home): https://github.com/burpheart/yuque-crawl