声明:基本步骤和核心方法均参考[1],[2],但细节上有较大不同,这篇着重于如何将抓取到的文章以markdown格式保存下来。
1. 上篇中已经抓到了公众号的所有文章的题目和链接,下篇直接读入:
import pandas as pd
from newspaper import Article
import html2text as ht
import yaml
import time
import requests
from bs4 import BeautifulSoup
import os
import tomd
import re
# Load the crawler settings from the YAML file; the 'cookie' and 'user-agent'
# entries are read from the resulting mapping when the headers are built.
with open("chaos-gravity.yaml", mode="r") as config_file:
    file_data = config_file.read()
    config = yaml.safe_load(file_data)
def get_headers(config):
    """Build the HTTP request headers from the loaded settings.

    ``config`` must provide the ``cookie`` and ``user-agent`` entries; their
    values become the ``Cookie`` and ``User-Agent`` headers respectively.
    A missing entry raises ``KeyError``.
    """
    header_to_setting = (("Cookie", 'cookie'), ("User-Agent", 'user-agent'))
    return {header: config[setting] for header, setting in header_to_setting}
# Request headers shared by every article download further down.
headers = get_headers(config)
# Raise the pandas column display width to 1000 characters
# (presumably so the rows printed in the main loop show full links).
pd.set_option('max_colwidth', 1000)
# Article list produced by the previous post; the CSV is read as GB18030.
article_list = pd.read_csv('article_list2.csv', encoding='GB18030')
    
def download_images(soup, content, article_name=None, image_root="assets/images"):
    """Save every ``<img>`` under ``content`` locally and swap it for a Markdown link.

    Images are stored as ``<image_root>/<article_name>/<n>.<type>``, numbered
    from 1 in document order.  The extension comes from the tag's
    ``data-type`` attribute and defaults to ``png``.  Files that already exist
    are not downloaded again.  Every ``<img>`` is then replaced by a ``<span>``
    whose text is the Markdown image link pointing at the local file.

    Args:
        soup: Object used to create the replacement tags (``soup.new_tag``).
        content: Element whose descendant ``<img>`` tags are processed.
        article_name: Sub-directory name for this article.  When omitted, the
            module-level ``filename`` minus its ``.md`` suffix is used, which
            is what the original two-argument call relied on.
        image_root: Directory under which the per-article folders are created.
    """
    if article_name is None:
        # Backward compatibility: the main loop sets the global ``filename``.
        article_name = str(filename[:-3])
    dir_name = "{0:s}/{1:s}".format(image_root, article_name)
    # makedirs also creates missing parent directories; os.mkdir raised
    # FileNotFoundError when "assets/images" did not exist yet.
    os.makedirs(dir_name, exist_ok=True)

    for cnt, image in enumerate(content.find_all("img"), start=1):
        img_type = image.get('data-type')
        img_name = "{0:d}.{1:s}".format(cnt, img_type if img_type else 'png')
        file_path = "{0:s}/{1:s}".format(dir_name, img_name)
        # The original read the address from data-src only; plain src is
        # tried as a fallback so such images are not lost.
        img_src = image.get('data-src') or image.get('src')
        if img_src and not os.path.exists(file_path):
            try:
                response = requests.get(url=img_src, timeout=30)
                response.raise_for_status()
            except requests.RequestException as exc:
                # Best effort: keep converting the article without this image.
                print("skip image {0}: {1}".format(img_src, exc))
            else:
                # Open the file only after a successful download, so a failed
                # request cannot leave an empty file that blocks a retry.
                with open(file_path, 'wb') as out:
                    out.write(response.content)
        tag = soup.new_tag('span')
        # The format string was empty before, so every image silently
        # disappeared from the generated Markdown.
        tag.string = "![]({0:s})".format(file_path)
        image.replace_with(tag)
        
def download_videos(soup, content, article_title=None, video_root="assets/videos"):
    """Replace embedded videos under ``content`` with Markdown links.

    No video file is downloaded; each video element that carries a source
    address is replaced by a ``<span>`` whose text is a Markdown link to that
    address.  The per-article directory is still created, as before, although
    nothing is written into it yet.

    Args:
        soup: Object used to create the replacement tags (``soup.new_tag``).
        content: Element whose descendant video elements are processed.
        article_title: Sub-directory name for this article.  When omitted, the
            module-level ``title`` is used, as in the original code.
        video_root: Directory under which the per-article folders are created.
    """
    if article_title is None:
        # Backward compatibility: the main loop sets the global ``title``.
        article_title = str(title)
    # makedirs also creates missing parent directories (os.mkdir did not).
    os.makedirs("{0:s}/{1:s}".format(video_root, article_title), exist_ok=True)

    # The original iterated over a ``videos`` name that was never defined and
    # therefore raised NameError.
    # NOTE(review): assumes videos are embedded as <iframe> or <video>
    # elements -- confirm against real article HTML.
    videos = content.find_all(["iframe", "video"])
    for video in videos:
        video_src = video.get('data-src') or video.get('src')
        if not video_src:
            # Without an address there is nothing to link to; leave the
            # element alone instead of failing on "{0:s}".format(None).
            continue
        tag = soup.new_tag('span')
        tag.string = "[此处是视频]({0:s})".format(video_src)
        video.replace_with(tag)
def for_list(content):
    """Rewrite ordered-list items under ``content`` as numbered text spans.

    Every ``<li>`` found inside an ``<ol>`` is replaced by a ``<span>`` whose
    text is ``"<n>. <item text>"`` followed by a newline.  Numbering restarts
    at 1 for each ``<ol>``.  The replacement tags are created through the
    module-level ``soup``.
    """
    for ordered_list in content.find_all("ol"):
        for number, item in enumerate(ordered_list.find_all("li"), start=1):
            numbered = soup.new_tag('span')
            numbered.string = "{0:d}. {1:s}\n".format(number, item.text)
            item.replace_with(numbered)
def for_code2(content):
    """Put a newline in front of the text of every ``<code>`` in a code snippet.

    Inside each element whose class is ``code-snippet__fix code-snippet__js``,
    every ``<code>`` element is replaced by a new ``<code>`` tag holding the
    original text preceded by ``"\\n"``.  The new tags are created through the
    module-level ``soup``.
    """
    snippet_class = "code-snippet__fix code-snippet__js"
    for snippet in content.find_all(class_=snippet_class):
        for code_node in snippet.find_all("code"):
            replacement = soup.new_tag('code')
            replacement.string = "".join(("\n", code_node.text))
            code_node.replace_with(replacement)
def for_code(content):
    """Merge the ``<code>`` elements of each code snippet into a single one.

    For every element whose class is ``code-snippet__fix code-snippet__js``
    the texts of its ``<code>`` descendants are concatenated, each preceded by
    a newline.  All but the last ``<code>`` are removed from the tree and the
    last one is replaced by a new ``<code>`` tag holding the merged text.
    Snippets that contain no ``<code>`` element are left untouched.  The new
    tags are created through the module-level ``soup``.
    """
    for snippet in content.find_all(class_="code-snippet__fix code-snippet__js"):
        code_nodes = snippet.find_all("code")
        if not code_nodes:
            # The original replaced "the last loop variable" after the loop,
            # which raised NameError here or hit a node left over from the
            # previous snippet.
            continue
        merged = soup.new_tag('code')
        merged.string = "".join("\n" + node.text for node in code_nodes)
        for node in code_nodes[:-1]:
            node.extract()
        code_nodes[-1].replace_with(merged)
# Characters that cannot appear in file or directory names (\ / : * ? " < > |),
# mapped to the substitutes the original replace() chain intended.  That chain
# looked for the two-character sequence "\/" and never replaced a plain "/".
_FILENAME_TABLE = str.maketrans({
    '/': '-', '\\': '-', '"': '-', '|': '-',
    ':': '_', '?': '_', '*': '_', '<': '_', '>': '_',
})


def _node_text(node):
    """Return ``node.text``, or ``None`` when the lookup found nothing."""
    return node.text if node is not None else None


# Download every article in the list, convert its body to Markdown and write
# it to "<sanitised title>.md" in the current directory.
for index, row in article_list.iterrows():
    print(index)
    print(row)
    create_time = time.strftime("%Y-%m-%d", time.localtime(row["create_time"]))
    # The stored links contain escaped slashes ("\/"); turn them back into "/".
    url = row['link'].replace("\\/", "/")
    try:
        article = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print("request failed, skipping {0}: {1}".format(url, exc))
        continue
    if article.status_code != 200:
        # Previously a failed request fell through and re-processed the HTML
        # of the preceding article (or raised NameError on the first row).
        print("HTTP {0:d}, skipping {1}".format(article.status_code, url))
        continue
    # Turn this <br> spelling into real newlines before parsing (needed for
    # the code blocks).
    html = article.text.replace("<br  />", "\n")
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find(id='img-content')
    # Some articles have been deleted, so no content can be extracted.
    if content is None:
        continue
    title_node = content.find(class_='rich_media_title')
    if title_node is None:
        print("no title found, skipping {0}".format(url))
        continue
    title = title_node.text.strip()
    print(title)
    filename = title.translate(_FILENAME_TABLE) + '.md'

    # Article metadata.  None of it is written to the output yet, so a
    # missing element must not abort the conversion.
    copyright = _node_text(content.find(class_='wx_tap_link js_wx_tap_highlight rich_media_meta icon_appmsg_tag appmsg_title_tag weui-wa-hotarea'))
    author = _node_text(content.find(class_='wx_tap_link js_wx_tap_highlight rich_media_meta_link weui-wa-hotarea'))
    if author is None:
        try:
            author = content.find(class_="rich_media_meta rich_media_meta_text").text.split("\n")[-2].split(" ")[-1]
        except (AttributeError, IndexError):
            author = None
    blog_name = _node_text(content.find(class_='profile_nickname'))
    try:
        blog_sign = content.div.div.div.find_all('p')[1].span.text
    except (AttributeError, IndexError):
        blog_sign = None
    # Reset per article so a value from the previous article is never reused.
    publish_time = None
    scripts = soup.html.body.find_all('script') if soup.html and soup.html.body else []
    for script in scripts:
        if 'publish_time' in str(script):
            try:
                publish_time = int(script.text.split("document.getElementById")[0].split("{e(")[-1].split(",\"")[-1].split("\",")[0])
            except ValueError:
                # The inline script did not have the expected shape.
                pass

    meta = content.find(id='meta_content')
    if meta is not None:
        meta.decompose()
    download_images(soup, content)
    for br in content.find_all('br'):
        br.decompose()
    for_list(content)
    for_code2(content)
    mdText = str(content).replace('<br/>','').replace('</code><code>', "").replace('\n</code>','</code>').replace('<code>\n','<code>')
    # Convert to Markdown; the code sections now wrap line by line.
    mdText = tomd.Tomd(mdText).markdown
    # Repair the places where the converter gets the format wrong.
    mdText = mdText.replace('# \n            \n'+title, '## '+title).replace("# \n", "\n")
    mdText = re.sub(r'\n[\- ]+[ ]+\n```', "\n \n```", mdText)
    with open(filename, 'w', encoding='utf8') as file:
        file.write(mdText)
代码有点长,调起来费了九牛二虎之力,这里试的是微信公众号的文章转markdown,可能和其他网页不是那么匹配,主要还是要去熟悉soup的使用方法,并坚持将匹配错误的地方各个击破。
没有实现的功能:
- 视频没办法处理
- 不能百分百准确,文字和图片还是偶有缺失,缺失率小于1%
- 有些文字格式会跑掉
- 列表不能准确转换,太懒了,懒得写了
- 基础版,特殊格式字符什么的,无法正确转换,需要自己开发
- 转换完仍需要校验,无法自动调整图片大小,需要手动调整
上面代码主要实现的功能:
- 代码模块可以成功换行了
- 图片可以自动下载保存到本地并赋予链接
- 一些匹配错误的地方做了校正
 
     
            
        上篇/0.png) 
 
                    
                
            
Comments