采集到的群已发布到小程序平台,扫码即可查看脚本的采集效果。下面是小红书群二维码采集脚本。
const puppeteer = require('puppeteer');
const fs = require('fs-extra');
const path = require('path');
// Placeholder for the HTTP client; assigned inside the main async IIFE via a
// dynamic `import('node-fetch')` before any request is made.
let fetch;
// Directory that is created (if missing) when the browser is first launched.
const saveDir = 'C:/Users/xxxx/Desktop/qrcode';
// Text file that is truncated on first launch and appended to by saveInfo().
const infoTxt = 'qrcode_info.txt';
// Passed to puppeteer.launch() as `userDataDir` (the Chrome profile directory).
const userDataPath = 'C:/chrome_dev';
/**
 * Fetch an image and hand back its bytes only if it is recognised as a
 * WeChat QR code.
 *
 * The image is requested with browser-like headers, loaded into a Buffer and
 * passed to `isWeixinQRCode` (defined outside this block). Nothing is written
 * to disk and nothing is uploaded here; the caller decides what to do with
 * the returned buffer.
 *
 * @param {string} url - Address of the image to fetch.
 * @returns {Promise<Buffer|null>} The image bytes when the HTTP status is 200
 *   and the QR check passes; `null` for any other status, a failed QR check,
 *   or any error raised along the way (errors are logged, never thrown).
 */
async function downloadImage(url) {
  const requestHeaders = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'image/avif,image/webp,image/apng,image/*,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Referer': 'https://www.xiaohongshu.com/explore'
  };

  try {
    const response = await fetch(url, { headers: requestHeaders });

    // Anything other than a plain 200 is treated as a failed download.
    if (response.status !== 200) {
      console.log('❌ 下载失败,状态码:', response.status);
      return null;
    }

    const imageBytes = Buffer.from(await response.arrayBuffer());
    console.log('准备识别图片:', url, 'buffer长度:', imageBytes.length);

    // Only images that pass the QR check are returned to the caller.
    const looksLikeWeixinQr = await isWeixinQRCode(imageBytes);
    if (looksLikeWeixinQr) {
      return imageBytes;
    }

    console.log('❌ 非微信2维码图片,跳过提交');
    return null;
  } catch (downloadError) {
    console.log('❌ 下载出错:', downloadError);
    return null;
  }
}
/**
 * Append one record describing a collected item to the info text file.
 *
 * The record is a single line of the form
 * `{title:<title>, expire:<expire>, url:<imageUrl>}` followed by a newline.
 * Values are interpolated as-is (no quoting or escaping), so the line is a
 * human-readable log entry rather than JSON.
 *
 * @param {*} title - Value written after `title:`.
 * @param {*} expire - Value written after `expire:`.
 * @param {*} imageUrl - Value written after `url:`.
 * @returns {Promise<void>} Resolves once the line has been appended.
 */
async function saveInfo(title, expire, imageUrl) {
  const fields = [`title:${title}`, `expire:${expire}`, `url:${imageUrl}`];
  const record = `{${fields.join(', ')}}\n`;
  await fs.appendFile(infoTxt, record, 'utf8');
}
// Entry point: load the keyword list, then run the spider once per keyword.
(async () => {
// node-fetch is loaded with a dynamic import and stored in the module-level
// `fetch` binding so the helper functions above can use it.
fetch = (await import('node-fetch')).default;
// Read the keyword file that sits next to this script (one keyword per line).
const keywordsPath = path.join(__dirname, 'keywords_xhs.txt');
let keywords = [];
try {
const raw = await fs.readFile(keywordsPath, 'utf8');
// Split on LF or CRLF, trim each line and drop empty ones.
keywords = raw.split(/\r?\n/).map(x => x.trim()).filter(x => x);
} catch (e) {
// NOTE(review): the message names keywords.txt, but the file actually read
// above is keywords_xhs.txt — confirm which name is intended.
console.log('未找到 keywords.txt 或读取失败:', e);
return;
}
// Keywords are processed strictly one after another; the spider function is
// declared further down in this same IIFE and is hoisted.
for (const kw of keywords) {
// Percent-encode the keyword so it is safe inside the query string.
const encoded = encodeURIComponent(kw);
const searchUrl = `https://www.xiaohongshu.com/search_result?keyword=${encoded}&source=web_explore_feed`;
console.log('开始处理关键词:', kw, searchUrl);
await runDouyinSpiderWithUrl(searchUrl);
}
/**
 * Open one search-result URL in the shared browser page, try to apply the
 * "一天内" (within one day) filter and, if that succeeds, scroll the page so
 * more results are requested.
 *
 * On the first call the browser is launched and a `response` listener is
 * attached to the page; browser and page are cached on `global.__douyinBrowser`
 * and `global.__douyinPage` and reused by later calls. The browser is never
 * closed here.
 *
 * Navigation and filtering failures are logged and swallowed so that one bad
 * keyword does not abort the caller's loop. Failures while launching the
 * browser still propagate.
 *
 * @param {string} searchUrl - Fully built search-result URL to open.
 * @returns {Promise<void>}
 */
async function runDouyinSpiderWithUrl(searchUrl) {
  // Create browser/page only once; later keywords reuse the same page.
  if (!global.__douyinPage) {
    global.__douyinPage = await createSpiderPage();
  }
  const page = global.__douyinPage;

  try {
    await page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
  } catch (navError) {
    // A navigation timeout must not reject the whole run: skip this keyword.
    console.log('页面打开失败,跳过该关键词:', searchUrl, navError);
    return;
  }

  try {
    await hoverFilterButton(page);
    const filtered = await clickOneDayFilter(page);
    if (filtered) {
      console.log('已自动筛选"一天内",开始自动滑动...');
      await scrollForMoreResults(page);
    } else {
      console.log('自动筛选失败,可手动筛选。');
    }
  } catch (e) {
    console.log('自动筛选失败,可手动筛选。', e);
  }
  // The browser is intentionally left open for the next keyword.
}

/**
 * Pause for `ms` milliseconds. Used instead of `page.waitForTimeout`, which
 * was deprecated and later removed from Puppeteer.
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/** Launch Chrome, maximise its window and attach the search-response listener. */
async function createSpiderPage() {
  await fs.writeFile(infoTxt, '', 'utf8');
  await fs.ensureDir(saveDir);
  global.__douyinBrowser = await puppeteer.launch({
    headless: false,
    executablePath: process.env.CHROME_PATH || 'C:/Program Files/Google/Chrome/Application/chrome.exe',
    userDataDir: userDataPath,
    args: [
      '--start-maximized'
    ],
    defaultViewport: null
  });
  const [page] = await global.__douyinBrowser.pages();
  await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
  // Maximise again through CDP in case the launch flag had no effect.
  const session = await page.target().createCDPSession();
  const { windowId } = await session.send('Browser.getWindowForTarget');
  await session.send('Browser.setWindowBounds', { windowId, bounds: { windowState: 'maximized' } });
  await page.setRequestInterception(false);
  // Registered exactly once because the page itself is created only once.
  page.on('response', handleSearchResponse);
  return page;
}

/**
 * Response listener: for search-notes API responses, check each note's cover
 * image and submit the notes whose cover passes downloadImage().
 */
async function handleSearchResponse(response) {
  if (!response.url().startsWith('https://edith.xiaohongshu.com/api/sns/web/v1/search/notes')) {
    return;
  }
  try {
    // Preflight responses carry no body; reading them would only throw.
    if (response.request().method() === 'OPTIONS') {
      return;
    }
    const json = JSON.parse(await response.text());
    const items = json?.data?.items;
    if (!Array.isArray(items)) {
      return;
    }
    for (const item of items) {
      const card = item.note_card;
      if (!card) continue;
      const title = card.display_title || '';
      const coverUrl = card.cover?.url_default || '';
      const tags = Array.isArray(card.corner_tag_info) ? card.corner_tag_info : [];
      // Keep only notes with a corner tag containing "小时"
      // (presumably an "N hours ago" label — confirm against the API).
      const isRecent = tags.some((tag) => tag.text && tag.text.includes('小时'));
      if (!title || !coverUrl || !isRecent) continue;

      const buffer = await downloadImage(coverUrl);
      if (buffer) {
        await submitNote(title, coverUrl);
      }
      // Throttle between candidates.
      await sleep(1000);
    }
  } catch (e) {
    // Previously swallowed silently; log so broken responses are visible.
    console.log('搜索结果处理失败:', e);
  }
}

/** POST one accepted note to the upload endpoint; failures are only logged. */
async function submitNote(title, coverUrl) {
  const data = {
    title: title,
    qrcode: coverUrl,
    images: coverUrl
  };
  try {
    const resp = await fetch('https://xxx.xxx.com/api/xxx/xxx/upload', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json'
      },
      body: JSON.stringify(data)
    });
    const result = await resp.text();
    console.log('已提交:', data, '返回:', result);
  } catch (err) {
    console.log('提交失败:', err);
  }
}

/**
 * Hover the "筛选" (filter) control. Two passes are made, and the elements
 * are looked up again on each pass so stale handles are never reused.
 */
async function hoverFilterButton(page) {
  for (let pass = 0; pass < 2; pass++) {
    await page.waitForSelector('div[class="filter"]', { timeout: 10000 });
    const candidates = await page.$$('div[class="filter"]');
    let hovered = false;
    for (const candidate of candidates) {
      const label = await candidate.$('span');
      if (!label) continue;
      const text = await page.evaluate((el) => el.textContent, label);
      if (!text || !text.includes('筛选')) continue;
      try {
        await label.hover();
        hovered = true;
        await sleep(2000);
        break;
      } catch (err) {
        console.log('筛选按钮悬停失败,重试...', err);
      }
    }
    if (!hovered) {
      console.log('未找到筛选按钮,重试...');
      await sleep(1000);
    }
  }
}

/**
 * Poll (up to 10 rounds) for a span containing "一天内" and click it.
 * @returns {Promise<boolean>} true once a click succeeded, false otherwise.
 */
async function clickOneDayFilter(page) {
  const maxRounds = 10;
  for (let round = 0; round < maxRounds; round++) {
    await sleep(1000); // avoid polling too fast
    const spans = await page.$$('span');
    for (const span of spans) {
      let text;
      try {
        text = await page.evaluate((el) => el.textContent, span);
      } catch {
        // Handle went stale between lookup and read; skip it.
        continue;
      }
      if (!text || !text.includes('一天内')) continue;
      try {
        await span.click();
      } catch (err) {
        console.log('"一天内"按钮点击失败,重试...', err);
        continue;
      }
      await sleep(2000);
      return true;
    }
    console.log('未检测到"一天内"按钮,5秒后重试...');
    await sleep(5000);
  }
  return false;
}

/**
 * Scroll to the bottom at most 5 times, stopping early after two consecutive
 * scrolls that leave the document height unchanged.
 */
async function scrollForMoreResults(page) {
  const maxScrolls = 5;
  let lastHeight = await page.evaluate('document.body.scrollHeight');
  let unchangedCount = 0;
  for (let i = 0; i < maxScrolls; i++) {
    await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
    await sleep(3000); // give the page time to load more results
    const newHeight = await page.evaluate('document.body.scrollHeight');
    if (newHeight !== lastHeight) {
      unchangedCount = 0;
      lastHeight = newHeight;
      continue;
    }
    unchangedCount++;
    if (unchangedCount >= 2) {
      console.log('页面无新数据,提前停止滑动。');
      break;
    }
  }
  console.log('滑动结束,页面已到底或无新内容。');
}
})();