# Source code for cnblogs_downloader

import json
import os
import re
import threading
from datetime import datetime

import httpx

import lib.cnblogs_api as api


class CnblogsDownloader:
    """Download the essays of a cnblogs account into per-category folders.

    Every category is downloaded by its own thread. All progress and error
    messages are printed to the console.
    """

    # Name of the marker file (stored in the working directory) that records
    # when the previous download run took place.
    _FLAG_FILE_NAME = ".CnblogsDownloaderFlag.json"

    # Timestamp layout used both by the marker file and by the ``dateUpdated``
    # field of the essay list.
    _TIME_FORMAT = "%Y-%m-%dT%H:%M:%S"

    # Characters that Windows does not allow in file names: \ / : * ? " < > |
    _ILLEGAL_NAME_CHARS = re.compile(r'[\\/?|":*<>]')

    # Pre-compiled image pattern used by ``_download_replace_img``.
    # Groups 1-4 belong to the Markdown form ``![alt](url)``:
    #   1 = text before the url, 2 = url, 3 = file name, 4 = closing ")".
    # Groups 5-8 belong to the HTML form ``<img ... src="url" ...>``:
    #   5 = text before the url, 6 = url, 7 = file name, 8 = rest of the tag.
    # NOTE (from the original author): non-capturing groups ``(?:)`` did not
    # work in this pattern, hence the plain capturing groups.
    _IMG_PATTERN = re.compile(
        r'(!\[[^\]]*?\]\()([^\)]*/([^\)]*?))(\))'
        r'|(<img[^>]*?src=")([^"]*/([^"]*?))("[^>]*?>)'
    )

    def __init__(self, cnblogs_cookie: str, workdir: str, download_img: bool = False):
        """Create a downloader and load the state of the previous run.

        :param str cnblogs_cookie: value of the cnblogs cookie
            ``.Cnblogs.AspNetCore.Cookies``
        :param str workdir: working directory, i.e. the download target
        :param bool download_img: whether images referenced by the essays
            are downloaded as well
        """
        self._total_essay = 0
        self._updated_essay = 0
        self._is_first_run = True
        self._last_update = None
        # Stored as an absolute path so that every path built from it is
        # independent of the process' current working directory.
        self._workdir = os.path.abspath(workdir)
        self._download_img = download_img
        self._lock = threading.Lock()
        self._http_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/99.0.4844.74 Safari/537.36",
            "Referer": "https://i.cnblogs.com/",
            "Cookie": rf".Cnblogs.AspNetCore.Cookies={cnblogs_cookie}"}
        self._category = api.get_category_list(self._http_headers)

        flag_path = self._flag_path()
        if os.path.isfile(flag_path):
            self._is_first_run = False
            with open(flag_path, "r", encoding="utf-8") as f:
                flag = json.load(f)
            self._last_update = datetime.strptime(flag["last_update"], self._TIME_FORMAT)

    def _flag_path(self) -> str:
        """Return the path of the marker file inside the working directory."""
        return os.path.join(self._workdir, self._FLAG_FILE_NAME)

    @staticmethod
    def _sanitize_name(name: str) -> str:
        """Replace every character that is illegal in a Windows file name by a space."""
        return CnblogsDownloader._ILLEGAL_NAME_CHARS.sub(" ", name)

    def download_to_subdir(self) -> int:
        """Run the download: one thread per category, then record the run time.

        :rtype: int
        :return: number of essays that were downloaded or refreshed in this run
        """
        # Taken *before* downloading: an essay edited while the run is in
        # progress then carries a later ``dateUpdated`` and is fetched again
        # next time instead of being treated as up to date.
        run_started = datetime.now().replace(microsecond=0)

        # Counters describe a single run, so start from zero on every call.
        self._total_essay = 0
        self._updated_essay = 0

        # Work on a local list; appending to ``self._category`` would add one
        # more "uncategorised" entry on every call.
        categories = [*self._category, {"categoryId": 0, "title": "未分类"}]

        download_threads = [
            threading.Thread(target=self._category_download_thread, args=(category,))
            for category in categories
        ]
        for download_thread in download_threads:
            download_thread.start()
        for download_thread in download_threads:
            download_thread.join()

        print(rf"总共{self._total_essay}篇随笔,更新了{self._updated_essay}篇")

        with open(self._flag_path(), "w", encoding="utf-8") as f:
            json.dump({"last_update": run_started.strftime(self._TIME_FORMAT)}, f)
        # Keep the in-memory state in step with the marker file so a second
        # call on the same instance is incremental as well.
        self._is_first_run = False
        self._last_update = run_started
        return self._updated_essay

    def _category_download_thread(self, category: dict) -> None:
        """Download every essay of one category (thread target).

        Only absolute paths are used here: the current working directory is
        shared by all threads of the process and must not be relied upon.

        :param dict category: basic category information; the keys
            ``title`` and ``categoryId`` are read
        """
        dirname = self._sanitize_name(category["title"])
        category_dir = os.path.join(self._workdir, dirname)
        # exist_ok: two categories may sanitise to the same directory name.
        os.makedirs(category_dir, exist_ok=True)

        essays = api.get_posts_list(self._http_headers, category_id=str(category["categoryId"]))
        with self._lock:
            self._total_essay += essays["postsCount"]

        for essay_pre in essays["postList"]:
            title = self._sanitize_name(essay_pre["title"])
            private_mark = "" if essay_pre["isPublished"] else "[非公开]"
            draft_mark = "[草稿]" if essay_pre["isDraft"] else ""
            filename = f"{title}{private_mark}{draft_mark}.md"
            essay_path = os.path.join(category_dir, filename)

            essay_date_updated = datetime.strptime(essay_pre["dateUpdated"], self._TIME_FORMAT)
            # Skip essays that already exist locally and were not modified
            # after the previous run.
            if (not self._is_first_run) and os.path.isfile(essay_path) \
                    and self._last_update > essay_date_updated:
                with self._lock:
                    print(rf"已是最新:{dirname}\{filename}")
                continue

            essay = api.get_post_by_id(self._http_headers, str(essay_pre["id"]))
            essay_content = essay["blogPost"]["postBody"]
            if self._download_img:
                essay_content = CnblogsDownloader._download_replace_img(filename, essay_content, category_dir)
            with open(essay_path, "w", encoding="utf-8") as f:
                f.write(essay_content)

            with self._lock:
                self._updated_essay += 1
                print(rf"已下载随笔:{dirname}\{filename}")

    @staticmethod
    def _download_replace_img(essay_title: str, essay_content: str, workdir: str) -> str:
        """Point image links at ``./img/<name>`` and download the images.

        Both ``![]()`` and ``<img src="xx" style="height:450px">`` are
        handled; other attributes of an ``<img>`` tag are kept, e.g. the
        result is ``<img src="./img/xx" style="height:450px">``.
        A link whose URL ends in ``/`` has no file name and is left untouched.
        Download failures are reported on the console and do not abort the
        remaining downloads.

        :param str essay_title: title used in the log messages
        :param str essay_content: essay text
        :param str workdir: directory of the essay; images go to ``<workdir>/img``
        :rtype: str
        :return: essay text with the image links rewritten
        """
        images = []  # (url, file name) of every rewritten link, in document order

        def replace(m):
            # Decide by the alternative that matched, not by the file-name
            # group: that group may legitimately be an empty string.
            if m.group(1) is not None:
                prefix, url, name, suffix = m.group(1, 2, 3, 4)
            else:
                prefix, url, name, suffix = m.group(5, 6, 7, 8)
            if not name:
                return m.group(0)
            images.append((url, name))
            return f"{prefix}./img/{name}{suffix}"

        essay_content = CnblogsDownloader._IMG_PATTERN.sub(replace, essay_content)
        if not images:
            return essay_content

        img_dir = os.path.join(workdir, "img")
        os.makedirs(img_dir, exist_ok=True)
        http_headers = {"Referer": "https://i.cnblogs.com/"}
        for url, img_name in images:
            # The file name is deliberately not validated any further.
            img_path = os.path.join(img_dir, img_name)
            if os.path.isfile(img_path):
                print(rf"图片已存在:{img_name}")
                continue
            try:
                r = httpx.get(url, headers=http_headers, timeout=api.TIMEOUT)
                # Do not store an error page as if it were the image.
                r.raise_for_status()
                with open(img_path, "wb") as f:
                    f.write(r.content)
                print(rf"已为《{essay_title}》下载图片:{img_name}")
            except Exception as e:  # best effort: one broken image must not stop the essay
                print(f"error: 为《{essay_title}》下载图片失败,链接:{url},原因:{e!r}")
        return essay_content