微信公众号定时监控

因为想着定时监控一些安全类的公众号的最新发布,所以网上找了找爬公众号的方法,不过都不太符合,所以就使用无头浏览器实现了一下。

一些获取公众号文章的方法:

  • Hook Windows 微信
  • 基于微信读书
  • 基于搜狗
  • 第三方平台
  • 基于微信公众号平台

这里主要是基于微信公众号平台实现,大概就 4 ~ 5 天需要登录一次,还可以接受。

实现思路:

  1. 访问 mp.weixin.qq.com 判断登录状态
  2. 如果没有登录,则抓取登录二维码,转换为可以在终端展示的格式输出( 因为要在服务器上面跑,所以要输出,大小就是那么大,没找到控制的办法 )
  3. 成功登录后就是模拟点击图文的页面,然后模拟输入微信公众号获取标题和链接
  4. 监控去重是根据链接,第一次的时候会把链接 hash 存储到文件中,然后后续就是基于这个文件去判断是否有新增

image-20250608131043544

代码中引用了项目里的其他库,如果需要使用,把爬虫部分摘出来就可以了。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
import warnings

import os

import mmh3

from scripts.utils.save import save_weixin_md
from scripts.utils.ding import send_ding_msg, dict_format_md
from scripts.utils.init import init_config
from scripts.utils.logger import init_logger
from scripts.utils.hash import get_hashes, init_hashes, add_hash

from webdriver_manager.chrome import ChromeDriverManager
import undetected_chromedriver as uc
import time
from selenium.webdriver import Keys
from selenium.webdriver.common.by import By
import pyqrcode
from pyzbar.pyzbar import decode
from PIL import Image
import datetime

# Silence ResourceWarning messages for the whole process.
warnings.filterwarnings("ignore", category=ResourceWarning)

# Project configuration and the logger used throughout this script.
config = init_config('config/config.yaml')
logger = init_logger(config['logger'], 'weixin')
# Account names to monitor; reloaded by reget_targets() at the start of each run.
targets = config['weixin']['targets']
# Driver path handed to uc.Chrome in create_browser().
driver_executable_path = ChromeDriverManager().install()
# Platform session token; filled in by check_login_status() once logged in.
token = ''

# Webhook secret/token passed to send_ding_msg().
ding_secret = config['ding_web_hook']['secret']
ding_token = config['ding_web_hook']['token']
# Hexo path from the config and the 'source/_posts' directory under it,
# which run() passes to save_weixin_md().
hexo_path = config['hexo_path']
hexo_post_path = os.path.join(hexo_path, 'source/_posts')
# New articles found since the last publish; incremented and reset in run().
count = 0

def create_browser():
    """Create the headless Chrome driver used for scraping.

    :return: a new ``uc.Chrome`` instance started with the module-level
        ``driver_executable_path``.
    """
    chrome_flags = (
        "--user-data-dir=./data/chrome-profile",  # this flag hangs on Windows
        "--no-sandbox",
        "--headless=new",
        "--disable-dev-shm-usage",
        "--disable-blink-features=AutomationControlled",
    )
    chrome_options = uc.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    # On Windows, start the driver with headless=False instead.
    return uc.Chrome(
        options=chrome_options,
        headless=True,
        driver_executable_path=driver_executable_path,
    )


# Shared driver used by every scraping function; replaced by restart_browser().
browser = create_browser()
logger.info('浏览器启动成功')


def reget_targets():
    """Reload the config file and refresh the module-level ``config`` and ``targets``."""
    global config, targets
    fresh_config = init_config('config/config.yaml')
    # Rebind ``config`` first so it is updated even if the lookup below fails.
    config = fresh_config
    targets = fresh_config['weixin']['targets']

def get_qrcode():
    """Fetch the login QR code and re-render it as terminal text.

    Opens the platform home page, screenshots the QR ``<img>`` element to
    ``data/qr_only.png``, decodes its payload with pyzbar and rebuilds a QR
    code from that payload with pyqrcode so it can be printed in a terminal.

    :return: the text produced by pyqrcode's ``terminal()`` renderer.
    :raises RuntimeError: if no QR code can be decoded from the screenshot.
    """
    browser.get("https://mp.weixin.qq.com/")
    time.sleep(5)  # fixed wait for the login page to render
    qr_img = browser.find_element(By.CSS_SELECTOR, "img.login__type__container__scan__qrcode")
    qrcode_file = 'data/qr_only.png'
    qr_img.screenshot(qrcode_file)
    # Decode inside the context manager so the image file handle is closed
    # instead of being leaked on every login attempt.
    with Image.open(qrcode_file) as img:
        decoded_objects = decode(img)
    if not decoded_objects:
        # Fail with an explicit message rather than an IndexError on [0].
        raise RuntimeError('未能从截图中识别出登录二维码')
    qr_content = decoded_objects[0].data.decode('utf-8')
    qr_url = pyqrcode.create(qr_content, error='L')
    return qr_url.terminal(module_color='black', background='white', quiet_zone=1)


def check_login_status():
    """Check whether the browser session is logged in to mp.weixin.qq.com.

    When the home-page URL carries a token, the token is stored in the
    module-level ``token`` and ``True`` is returned. Otherwise, if a token
    was held before, a re-login notice is sent through ``send_ding_msg``,
    and the operator is prompted on stdin:

    * ``Y`` prints the login QR code, waits 60 seconds and checks again;
    * ``N`` terminates the program with exit status 0.

    :return: ``True`` when logged in, ``False`` when the prompt answer is
        neither ``Y`` nor ``N``.
    :raises SystemExit: when the operator answers ``N``.
    """
    global token
    logger.info('正在检测当前登录状态 ...')
    browser.get("https://mp.weixin.qq.com/")
    time.sleep(5)
    # Read the URL once so the check and the parse see the same value.
    current_url = browser.current_url
    if '/cgi-bin/home?t=home/index&lang=zh_CN&token=' in current_url:
        # Keep only the token value, dropping any query parameters after it.
        token = current_url.split('token=')[1].split('&')[0]
        logger.info('检测到已经登录, token={}'.format(token))
        return True
    if token:
        # A token was held earlier, so the session has expired: notify.
        send_ding_msg(
            title="微信公众号登录",
            content="请重新扫码登录",
            access_token=ding_token,
            secret=ding_secret,
        )
    logger.info('检测到未登录或已失效')
    # Accept lowercase answers and surrounding whitespace as well.
    value = input("请输入 Y 获取二维码进行登录或者 N 退出: ").strip().upper()
    if value == 'Y':
        logger.info('请在 1 分钟内登录: ')
        print(get_qrcode())
        time.sleep(60)
        return check_login_status()
    if value == 'N':
        logger.info('bye bye !')
        # The site-provided exit() helper is meant for interactive use only.
        raise SystemExit(0)
    return False


def get_articles(name):
    """Collect the articles the platform lists for one official account.

    Drives the article editor's "insert link" dialog: opens the editor,
    clicks the hyperlink tool, switches to "other account", searches for
    ``name``, picks the first search result and reads the article list shown
    for it (the original note describes this as the latest 4 articles).

    :param name: account name or WeChat ID typed into the search box.
    :return: list of dicts with the keys ``title``, ``name``, ``link`` and
        ``mmh3_hash`` (unsigned mmh3 hash of the link, as a string).
    """
    posts = []
    logger.info('开始获取 {} 的最新文章 ...'.format(name))
    href = 'https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=10&token={}&lang=zh_CN'.format(
        token)
    browser.get(href)
    logger.info("已打开图文消息页面")
    time.sleep(5)
    link_element = browser.find_element(By.CSS_SELECTOR, "li#js_editor_insertlink")
    link_element.click()
    logger.info("已点击超链接")
    time.sleep(3)
    other_account_button = browser.find_element(
        By.CSS_SELECTOR,
        "div.weui-desktop-btn_wrp.weui-desktop-link-btn.weui-desktop-link > button.weui-desktop-btn.weui-desktop-btn_default")
    other_account_button.click()
    logger.info("已点击'选择其他账号'按钮")
    time.sleep(5)

    input_element = browser.find_element(
        By.CSS_SELECTOR,
        "input.weui-desktop-form__input[placeholder='输入文章来源的账号名称或微信号,回车进行搜索']")
    input_element.send_keys(name)
    input_element.send_keys(Keys.RETURN)
    time.sleep(5)

    first_result = browser.find_element(
        By.CSS_SELECTOR,
        "ul.inner_link_account_list > li.inner_link_account_item:first-child")
    logger.info('选择公众号: {}'.format(first_result.text))
    first_result.click()
    time.sleep(5)

    articles = browser.find_elements(By.CSS_SELECTOR, "label.inner_link_article_item")
    for article in articles:
        title = article.find_element(By.CSS_SELECTOR, "div.inner_link_article_title").text
        link = article.find_element(
            By.CSS_SELECTOR,
            "div.inner_link_article_date > span.weui-desktop-vm_default > a").get_attribute(
            "href")
        if not link:
            # Without an href there is nothing to hash or de-duplicate on;
            # skip this entry instead of failing the whole run.
            logger.error('文章缺少链接, 已跳过: {}'.format(title))
            continue
        logger.info(title)
        posts.append({
            'title': title,
            "name": name,
            'link': link,
            'mmh3_hash': str(mmh3.hash(link, signed=False))
        })
    return posts


def restart_browser():
    """Quit the current browser on a best-effort basis and start a new one."""
    global browser
    logger.info('开始重启浏览器 ...')
    try:
        browser.quit()
    except Exception as quit_error:
        # A failed quit is only logged; a new browser is created regardless.
        logger.error(quit_error)
    finally:
        browser = create_browser()


def run():
    """Run one monitoring pass over every configured account.

    For each target the listed articles are fetched and compared against the
    hashes stored in ``data/weixin_<target>_hash.txt``. The first pass for a
    target only initialises that file. Every unseen article is recorded,
    saved with ``save_weixin_md`` and announced with ``send_ding_msg``. If
    anything new was found, ``hexo_puhish()`` is called and the counter is
    reset.

    For unknown reasons the driver sometimes fails with
    "[ERROR] HTTPConnectionPool(host='localhost', port=36541): Read timed out.
    (read timeout=120)"; the main loop handles that by restarting the browser.

    :raises RuntimeError: if the session is not logged in.
    """
    global count
    # Do not scrape with an empty or stale token when login was not confirmed.
    if not check_login_status():
        raise RuntimeError('未登录, 跳过本次文章获取')
    reget_targets()
    for target in targets:
        posts = get_articles(target)
        if not posts:
            continue
        hash_file = 'data/weixin_{}_hash.txt'.format(target)
        hashes = get_hashes(hash_file)
        if not hashes:
            # First pass for this target: record the current articles as the
            # baseline without announcing them.
            logger.info('未检测到 hashes 进行初始化')
            init_hashes(posts, hash_file)
            logger.info('初始化成功, 本地检索结束 !')
            continue
        for post in posts:
            mmh3_hash = post['mmh3_hash']
            if mmh3_hash in hashes:
                continue
            logger.info('发现新增文章: {}'.format(post['title']))
            count += 1
            add_hash(mmh3_hash, hash_file)
            save_weixin_md(hexo_post_path, post)
            send_ding_msg(
                title="微信公众号更新",
                content=dict_format_md(post),
                access_token=ding_token,
                secret=ding_secret,
            )
    if count:
        # Imported only when there is something to publish.
        from hexo_puhish import hexo_puhish
        hexo_puhish()
        count = 0

# Pause between two successful passes. The same value drives both the sleep
# and the logged "next run" time so the two cannot disagree.
run_interval = datetime.timedelta(minutes=30)

while True:
    try:
        run()
    except Exception as e:
        # Any failure (e.g. the driver read timeout described in run()) is
        # logged, the browser is restarted and the pass is retried at once.
        logger.error(e)
        restart_browser()
        continue
    # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated.
    now = datetime.datetime.now(datetime.timezone.utc)
    next_time = now + run_interval
    logger.info('本次任务执行成功, 下次任务执行时间: {}'.format(next_time.isoformat()))
    time.sleep(run_interval.total_seconds())

启动后就是这样,扫码登录即可。可以在 Linux 下运行,安装一下 Chrome 就行。

image-20250608130435403

服务器上运行日志:

image-20250608131146855

由于二维码特别大,并且没有找到合适的方法让终端输出变小,就只能从链接入手了:通过短链接来缩小二维码。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
def get_short_link(link) -> str:
    """Shorten ``link`` with the short-link generator at helingqi.com (禾令奇).

    :param link: the URL to shorten.
    :return: the first entry of the response's ``data`` list, or ``''`` when
        the request or the response parsing fails (the error is logged).
    """
    import requests
    try:
        url = "https://www.helingqi.com/url.php"
        headers = {
            "Sec-Ch-Ua": '"Chromium";v="121", "Not A(Brand";v="99"',
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "X-Requested-With": "XMLHttpRequest",
            "Sec-Ch-Ua-Mobile": "?0",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.6167.85 Safari/537.36",
            "Sec-Ch-Ua-Platform": '"Windows"',
            "Msg": "Think You !!!",
            "Origin": "https://www.helingqi.com",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Dest": "empty",
            "Referer": "https://www.helingqi.com/url.php",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Priority": "u=1, i"
        }
        data = {
            "url": link,
            "type": "sina"
        }
        # Without a timeout an unresponsive server would block the caller
        # forever; a timeout error is handled by the except branch below.
        # NOTE(review): verify=False disables TLS certificate checking and is
        # kept from the original - confirm it is really required.
        res = requests.post(url, headers=headers, data=data, verify=False, timeout=10)
        res.raise_for_status()
        return res.json()["data"][0]
    except Exception as e:
        logger.error(e)
        return ''

可以看到链接确实缩短成功了,不过并没有达到目的,因为扫码的时候微信会提示这是外部链接:

image-20250621213430881


微信公众号定时监控
https://liancccc.github.io/2025/06/21/技术/开发/微信公众号定时监控/
作者
守心
发布于
2025年6月21日
许可协议