Disclaimer: the basic steps and core method follow [1] and [2], though the details differ quite a bit. This post focuses on how to save the scraped articles in Markdown format.
1. The previous post already scraped the titles and links of all the Official Account's articles; here we simply read them back in:
import pandas as pd
from newspaper import Article
import html2text as ht
import yaml
import time
import requests
from bs4 import BeautifulSoup
import os
import tomd
import re
with open("chaos-gravity.yaml", "r") as file:
file_data = file.read()
config = yaml.safe_load(file_data)
def get_headers(config):
headers = {
"Cookie": config['cookie'],
"User-Agent": config['user-agent']
}
return headers
pd.set_option('max_colwidth', 1000)
article_list = pd.read_csv('article_list2.csv', encoding='GB18030')
headers = get_headers(config)
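# (Assumed shapes, not spelled out in the original post.)
# chaos-gravity.yaml only needs the two fields read above, e.g.:
#   cookie: "<your logged-in WeChat cookie>"
#   user-agent: "<your browser User-Agent string>"
# article_list2.csv is the output of the previous post and must contain at
# least the columns used below: 'link' and 'create_time' (a Unix timestamp).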
def download_images(soup, content):
    # Relies on the global `filename` set in the loop below; images are saved
    # under assets/images/<article name>/ and each <img> tag is replaced with
    # a Markdown image link pointing at the local copy.
    dir_name = "assets/images/" + str(filename[:-3]) + "/"
    if not os.path.isdir(dir_name):
        os.makedirs(dir_name)   # makedirs, so assets/images/ is created too
    cnt = 1
    images = content.find_all("img")
    for image in images:
        img_src = image.get('data-src')
        img_type = image.get('data-type')
        if not img_src:   # skip decorative images that carry no data-src
            continue
        img_name = "{0:d}.{1:s}".format(cnt, img_type if img_type else 'png')
        cnt += 1
        file_path = "assets/images/{0:s}/{1:s}".format(filename[:-3], img_name)
        if not os.path.exists(file_path):
            with open(file_path, 'wb') as file:
                response = requests.get(img_src, stream=True)
                for block in response.iter_content(1024):
                    if block:
                        file.write(block)
                    else:
                        break
        tag = soup.new_tag('span')
        tag.string = "![](assets/images/{0:s}/{1:s})".format(filename[:-3], img_name)
        image.replace_with(tag)
def download_videos(soup, content):
    # Unfinished: as noted at the end of the post, videos are not really
    # handled. This version only swaps each embedded video for a placeholder
    # link to its data-src; nothing is downloaded, and the function is not
    # called in the loop below.
    dir_name = "assets/videos/" + str(title) + "/"
    if not os.path.isdir(dir_name):
        os.makedirs(dir_name)
    videos = content.find_all('iframe')   # assumption: video embeds live in <iframe> tags
    if videos:
        for video in videos:
            video_src = video.get('data-src')
            tag = soup.new_tag('span')
            tag.string = "[video placeholder]({0:s})".format(video_src)
            video.replace_with(tag)
def for_list(content):
    # Ordered lists: replace each <li> with a "1. ..." text line so the
    # numbering survives the Markdown conversion. Uses the global `soup`
    # from the loop below.
    for a in content.find_all("ol"):
        i = 1
        for b in a.find_all("li"):
            tag = soup.new_tag('span')
            tag.string = "{0:d}. {1:s}\n".format(i, b.text)
            b.replace_with(tag)
            i = i + 1
def for_code2(content):
    for a in content.find_all(class_="code-snippet__fix code-snippet__js"):
        for b in a.find_all("code"):
            tag = soup.new_tag('code')
            tag.string = "\n" + b.text
            b.replace_with(tag)

def for_code(content):
    for a in content.find_all(class_="code-snippet__fix code-snippet__js"):
        tag = soup.new_tag('code')
        string = ""
        length = len(a.find_all("code"))
        i = 1
        for b in a.find_all("code"):
            string = string + "\n" + b.text
            if i < length:
                b.extract()
            i = i + 1
        tag.string = string
        b.replace_with(tag)
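# Note on the two helpers above (not in the original comments): WeChat appears
# to render code blocks as a "code-snippet__fix code-snippet__js" element with
# one <code> tag per line. for_code2 prefixes every <code> line with a newline,
# while for_code merges all lines of a snippet into a single <code> tag; either
# way the goal is that tomd emits real line breaks inside code blocks. Only
# for_code2 is actually called in the loop below.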
for index, row in article_list.iterrows():
    print(index)
    print(row)
    create_time = time.strftime("%Y-%m-%d", time.localtime(row["create_time"]))
    url = row['link'].replace("\\/", "/")   # the link field may contain escaped "\/" sequences
    article = requests.get(url, headers=headers)
    if article.status_code == 200:
        html = article.text
        # for_code: turn <br /> into real newlines before parsing
        html = html.replace("<br />", "\n")
        soup = BeautifulSoup(html, 'html.parser')
        content = soup.find(id='img-content')
        # some articles have been deleted, in which case no content can be extracted
        if content:
            title = content.find(class_='rich_media_title').text.strip()
            print(title)
            # \ / : * ? " < > | are not allowed in file or folder names
            filename = title.replace('/', '-').replace('\\', '-').replace('"', '-').replace('|', '-').replace(':', '_').replace('?', '_').replace('*', '_').replace('<', '_').replace('>', '_') + '.md'
            try:
                copyright = content.find(class_='wx_tap_link js_wx_tap_highlight rich_media_meta icon_appmsg_tag appmsg_title_tag weui-wa-hotarea').text
            except:
                copyright = None
            try:
                author = content.find(class_='wx_tap_link js_wx_tap_highlight rich_media_meta_link weui-wa-hotarea').text
            except:
                author = content.find(class_="rich_media_meta rich_media_meta_text").text.split("\n")[-2].split(" ")[-1]
            blog_name = content.find(class_='profile_nickname').text
            blog_sign = content.div.div.div.find_all('p')[1].span.text
            # publish_time is buried in an inline <script>; pull it out by string splitting
            for i in soup.html.body.find_all('script'):
                if 'publish_time' in str(i):
                    publish_time = int(i.text.split("document.getElementById")[0].split("{e(")[-1].split(",\"")[-1].split("\",")[0])
            # the metadata above (create_time, author, blog_name, ...) is collected
            # but this version does not write it into the Markdown file
            content.find(id='meta_content').decompose()
            download_images(soup, content)
            brs = content.find_all('br')
            if brs:
                for br in brs:
                    br.decompose()
            for_list(content)
            for_code2(content)
            mdText = str(content).replace('<br/>', '').replace('</code><code>', "").replace('\n</code>', '</code>').replace('<code>\n', '<code>')
            # let tomd do the HTML-to-Markdown conversion; code blocks keep their line breaks
            mdText = tomd.Tomd(mdText).markdown
            # patch up places where the conversion goes wrong
            mdText = mdText.replace('# \n \n' + title, '## ' + title).replace("# \n", "\n")
            mdText = re.sub(r'\n[\- ]+[ ]+\n```', "\n \n```", mdText)
            with open(filename, 'w', encoding='utf8') as file:
                file.write(mdText)
The code is fairly long and took quite a bit of effort to debug. It is written specifically for converting WeChat Official Account articles to Markdown, so it may not fit other sites directly. The main work is getting familiar with how BeautifulSoup is used and then knocking out the mis-converted spots one by one.
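The pattern used throughout is always the same: find an element, build a replacement tag whose string is already valid Markdown, and swap it in before handing everything to tomd. A minimal sketch of that pattern (the HTML snippet and paths here are made up for illustration):

```python
from bs4 import BeautifulSoup

html = '<div id="img-content"><p>Hello <img data-src="https://example.com/1.png"/></p></div>'
soup = BeautifulSoup(html, 'html.parser')
content = soup.find(id='img-content')

# replace the <img> with a <span> whose text is already a Markdown image link
img = content.find('img')
tag = soup.new_tag('span')
tag.string = "![](assets/images/demo/1.png)"
img.replace_with(tag)

print(content)   # the <img> is gone; the Markdown link survives the conversion
```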
Features not implemented:
- Videos cannot be handled
- Not 100% accurate: text and images still go missing occasionally, though the loss rate is under 1%
- Some text formatting gets lost
- Lists are not converted accurately (too lazy to finish this; a possible fix for unordered lists is sketched right after this list)
- This is a basic version: special characters and unusual formatting are not converted correctly and need extra work
- The output still needs proofreading after conversion, and image sizes cannot be adjusted automatically
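For the list-conversion gap, one possible direction is a hypothetical `for_ul` helper mirroring `for_list`; this is not part of the original script, just an untested sketch following the same pattern:

```python
def for_ul(soup, content):
    # Sketch (assumption, not from the original post): replace each <li> of an
    # unordered list with a "- ..." text line, the way for_list handles <ol>.
    for a in content.find_all("ul"):
        for b in a.find_all("li"):
            tag = soup.new_tag('span')
            tag.string = "- {0:s}\n".format(b.text)
            b.replace_with(tag)
        a.unwrap()   # drop the now-empty <ul> wrapper before tomd sees it
```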
What the code above does achieve:
- Code blocks keep their line breaks
- Images are downloaded automatically, saved locally, and re-linked in the Markdown
- A number of conversion errors are patched up