threeperson
发布于 2025-10-19 / 1 阅读
0
0

微信群采集-小红书版本

采集到的群已发布到小程序平台，扫码即可查看脚本的采集效果。下方是从小红书采集微信群二维码的脚本。


const puppeteer = require('puppeteer');
const fs = require('fs-extra');
const path = require('path');
let fetch;

const saveDir = 'C:/Users/xxxx/Desktop/qrcode';
const infoTxt = 'qrcode_info.txt';
const userDataPath = 'C:/chrome_dev';

/**
 * Fetch an image and keep it only if it is recognised as a WeChat QR code.
 *
 * The image is requested with browser-like headers (including a Xiaohongshu
 * Referer). On HTTP 200 the body is converted to a Buffer and handed to
 * `isWeixinQRCode`; the Buffer is returned when that check is truthy.
 *
 * NOTE(review): `isWeixinQRCode` is not defined in the visible part of this
 * file and must be provided elsewhere. If it is missing, the resulting
 * ReferenceError is caught below and the function returns null.
 *
 * @param {string} url - Image URL to download.
 * @returns {Promise<Buffer|null>} The image bytes when the QR check passes;
 *     null on a non-200 status, a failed QR check, or any thrown error.
 */
async function downloadImage(url) {
    const requestHeaders = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'image/avif,image/webp,image/apng,image/*,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Referer': 'https://www.xiaohongshu.com/explore'
    };

    try {
        // `fetch` is the module-level binding that is assigned at startup.
        const response = await fetch(url, { headers: requestHeaders });
        if (response.status !== 200) {
            console.log('❌ 下载失败,状态码:', response.status);
            return null;
        }

        const imageBytes = Buffer.from(await response.arrayBuffer());
        console.log('准备识别图片:', url, 'buffer长度:', imageBytes.length);

        // Only images that pass the QR check are handed back to the caller.
        const passedQrCheck = await isWeixinQRCode(imageBytes);
        if (passedQrCheck) {
            return imageBytes;
        }
        console.log('❌ 非微信2维码图片,跳过提交');
        return null;
    } catch (err) {
        // Network failures and recognition errors are both reported here.
        console.log('❌ 下载出错:', err);
        return null;
    }
}



/**
 * Append one record describing a collected QR code to the info text file.
 *
 * The record is a single line of the form
 * `{title:<title>, expire:<expire>, url:<imageUrl>}` followed by a newline.
 * The values are interpolated as-is, so the line is not valid JSON.
 *
 * @param {*} title - Title to record.
 * @param {*} expire - Expiry value to record.
 * @param {*} imageUrl - Image URL to record.
 * @returns {Promise<void>} Resolves once the line has been appended.
 */
async function saveInfo(title, expire, imageUrl) {
    const fields = [`title:${title}`, `expire:${expire}`, `url:${imageUrl}`];
    const record = `{${fields.join(', ')}}\n`;
    await fs.appendFile(infoTxt, record, 'utf8');
}



(async () => {
    fetch = (await import('node-fetch')).default;
    // 读取关键词文件
    const keywordsPath = path.join(__dirname, 'keywords_xhs.txt');
    let keywords = [];
    try {
        const raw = await fs.readFile(keywordsPath, 'utf8');
        keywords = raw.split(/\r?\n/).map(x => x.trim()).filter(x => x);
    } catch (e) {
        console.log('未找到 keywords.txt 或读取失败:', e);
        return;
    }

    for (const kw of keywords) {
        // 中文转 URL 编码
        const encoded = encodeURIComponent(kw);
        const searchUrl = `https://www.xiaohongshu.com/search_result?keyword=${encoded}&source=web_explore_feed`;
        console.log('开始处理关键词:', kw, searchUrl);
        await runDouyinSpiderWithUrl(searchUrl);
    }

    /**
     * Open one Xiaohongshu search URL in the shared browser page, try to apply
     * the "一天内" (within one day) filter, and scroll so more results load.
     *
     * The browser and page are created on the first call and cached on
     * `global` (`__douyinBrowser` / `__douyinPage`); later calls only navigate
     * that same page. Search API responses are consumed by a `response`
     * listener registered once on the page, so nothing is returned from here.
     * The browser is intentionally left open when this function finishes.
     *
     * @param {string} searchUrl - Fully built search-result URL to navigate to.
     * @returns {Promise<void>} Resolves once filtering/scrolling has finished
     *     or has been given up on.
     * @throws Errors from browser/page creation and from `page.goto` propagate
     *     to the caller; errors while filtering or scrolling are only logged.
     */
    async function runDouyinSpiderWithUrl(searchUrl) {
        // Create the browser/page only for the first keyword.
        if (!global.__douyinPage) {
            global.__douyinPage = await createSpiderPage();
        }

        // Switching keywords only navigates the existing page.
        const page = global.__douyinPage;
        await page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });

        try {
            await hoverFilterButton(page);
            const clicked = await clickOneDayFilter(page);
            if (clicked) {
                console.log('已自动筛选"一天内",开始自动滑动...');
                await scrollForMoreResults(page);
                console.log('滑动结束,页面已到底或无新内容。');
            } else {
                console.log('自动筛选失败,可手动筛选。');
            }
        } catch (e) {
            console.log('自动筛选失败,可手动筛选。', e);
        }
    }

    /**
     * Resolve after `ms` milliseconds.
     *
     * Used instead of `page.waitForTimeout`, which is not available in newer
     * Puppeteer releases; a plain timer works with every version.
     */
    function sleep(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
    }

    /**
     * Launch Chrome, prepare its first tab, and register the search-response
     * listener. Also truncates the info file and ensures the save directory exists.
     *
     * @returns {Promise<object>} The prepared Puppeteer page.
     */
    async function createSpiderPage() {
        await fs.writeFile(infoTxt, '', 'utf8');
        await fs.ensureDir(saveDir);
        global.__douyinBrowser = await puppeteer.launch({
            headless: false,
            executablePath: process.env.CHROME_PATH || 'C:/Program Files/Google/Chrome/Application/chrome.exe',
            userDataDir: userDataPath,
            args: [
                '--start-maximized'
            ],
            defaultViewport: null
        });
        const [page] = await global.__douyinBrowser.pages();
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
        // Maximize again through CDP in case the launch flag had no effect.
        const session = await page.target().createCDPSession();
        const { windowId } = await session.send('Browser.getWindowForTarget');
        await session.send('Browser.setWindowBounds', { windowId, bounds: { windowState: 'maximized' } });
        await page.setRequestInterception(false);
        // Registered exactly once, because the page is reused for every keyword.
        page.on('response', handleSearchResponse);
        return page;
    }

    /**
     * `response` listener: for search-notes API responses, check each recent
     * note's cover image and submit the ones `downloadImage` accepts.
     *
     * Never rejects: it runs as an event listener, so an escaping rejection
     * would be unhandled. Failures are logged instead.
     */
    async function handleSearchResponse(response) {
        const url = response.url();
        if (!url.startsWith('https://edith.xiaohongshu.com/api/sns/web/v1/search/notes')) {
            return;
        }
        try {
            // CORS preflight responses carry no body; reading one would only throw.
            if (response.request().method() === 'OPTIONS') {
                return;
            }
            const json = JSON.parse(await response.text());
            if (!json || !json.data || !Array.isArray(json.data.items)) {
                return;
            }
            for (const item of json.data.items) {
                const card = item && item.note_card;
                if (!card) continue;
                const title = card.display_title || '';
                const coverUrl = card.cover && card.cover.url_default ? card.cover.url_default : '';
                const tags = Array.isArray(card.corner_tag_info) ? card.corner_tag_info : [];
                // A corner tag containing "小时" (hours) marks the note as recent.
                const isRecent = tags.some(tag => tag && typeof tag.text === 'string' && tag.text.includes('小时'));
                if (!title || !coverUrl || !isRecent) continue;

                const buffer = await downloadImage(coverUrl);
                if (buffer) {
                    await submitQrNote(title, coverUrl);
                }
                // Pause between candidate images.
                await sleep(1000);
            }
        } catch (e) {
            // Best-effort processing: report the problem instead of hiding it.
            console.log('处理搜索响应失败:', e);
        }
    }

    /**
     * POST one accepted note (title plus cover URL) to the upload endpoint.
     * Failures are logged and not rethrown.
     */
    async function submitQrNote(title, coverUrl) {
        const data = {
            title: title,
            qrcode: coverUrl,
            images: coverUrl
        };
        try {
            const resp = await fetch('https://xxx.xxx.com/api/xxx/xxx/upload', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json'
                },
                body: JSON.stringify(data)
            });
            const result = await resp.text();
            console.log('已提交:', data, '返回:', result);
        } catch (err) {
            console.log('提交失败:', err);
        }
    }

    /**
     * Hover the "筛选" (filter) button. Elements are looked up afresh on each
     * pass so that stale element handles are not reused.
     *
     * NOTE(review): both passes always run, even after a successful hover —
     * presumably a deliberate re-hover; confirm before short-circuiting.
     */
    async function hoverFilterButton(page) {
        for (let attempt = 0; attempt < 2; attempt++) {
            await page.waitForSelector('div[class="filter"]', { timeout: 10000 });
            const divs = await page.$$('div[class="filter"]');
            let found = false;
            for (const div of divs) {
                const span = await div.$('span');
                if (!span) continue;
                const text = await page.evaluate(el => el.textContent, span);
                if (!text || !text.includes('筛选')) continue;
                try {
                    await span.hover();
                    found = true;
                    await sleep(2000);
                    break;
                } catch (err) {
                    console.log('筛选按钮悬停失败,重试...', err);
                }
            }
            if (!found) {
                console.log('未找到筛选按钮,重试...');
                await sleep(1000);
            }
        }
    }

    /**
     * Poll the page (up to 10 rounds) for a span containing "一天内" and click it.
     *
     * @returns {Promise<boolean>} true once a click succeeded, false if every round failed.
     */
    async function clickOneDayFilter(page) {
        for (let retryCount = 0; retryCount < 10; retryCount++) {
            await sleep(1000); // avoid polling too quickly
            const candidates = await page.$$('span');
            for (const el of candidates) {
                let text;
                try {
                    text = await page.evaluate(node => node.textContent, el);
                } catch (err) {
                    // The element handle went stale; skip it.
                    continue;
                }
                if (!text || !text.includes('一天内')) continue;
                try {
                    await el.click();
                    await sleep(2000);
                    return true;
                } catch (err) {
                    console.log('"一天内"按钮点击失败,重试...', err);
                }
            }
            console.log('未检测到"一天内"按钮,5秒后重试...');
            await sleep(5000);
        }
        return false;
    }

    /**
     * Scroll to the bottom up to 5 times, pausing 3 seconds after each scroll,
     * and stop early after two consecutive scrolls that do not change the page height.
     */
    async function scrollForMoreResults(page) {
        const maxScrolls = 5;
        let lastHeight = await page.evaluate('document.body.scrollHeight');
        let noChangeCount = 0;
        for (let i = 0; i < maxScrolls; i++) {
            await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
            await sleep(3000);
            const newHeight = await page.evaluate('document.body.scrollHeight');
            if (newHeight === lastHeight) {
                noChangeCount++;
                if (noChangeCount >= 2) {
                    console.log('页面无新数据,提前停止滑动。');
                    break;
                }
            } else {
                noChangeCount = 0;
                lastHeight = newHeight;
            }
        }
    }
})();


评论