threeperson
发布于 2025-09-06 / 17 阅读
0
0

微信群采集-抖音版本

https://h5.threeperson.com/#/ 是一个微信群二维码平台,每天更新 300-500 个群二维码,数据主要采集自抖音。

脚本如下。

// dy.js - 抖音自动化爬虫(核心功能 JS 版)

// Output directory handed to fs.ensureDir when the spider starts. It can be overridden
// with the QRCODE_SAVE_DIR environment variable (same pattern as CHROME_PATH further
// down); the fallback keeps the historical hard-coded Windows path.
const saveDir = process.env.QRCODE_SAVE_DIR || 'C:/Users/40650/Desktop/qrcode';
// Relative path of the text file that saveInfo appends to and that is truncated when
// the browser page is first created.
const infoTxt = 'qrcode_info.txt';
/**
 * Fetch an image and keep it only when it decodes as a WeChat QR code.
 *
 * Nothing is written to disk: the bytes are held in memory, handed to
 * isWeixinQRCode, and returned to the caller on a positive match.
 *
 * @param {string} url - Image URL to request.
 * @returns {Promise<Buffer|null>} The image bytes when isWeixinQRCode accepts them;
 *   null for a non-200 response, a rejected image, or any thrown error.
 */
async function downloadImage(url) {
    // Browser-like request headers, including a douyin.com Referer.
    const requestHeaders = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'image/avif,image/webp,image/apng,image/*,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Referer': 'https://www.douyin.com/'
    };

    try {
        const response = await fetch(url, { headers: requestHeaders });
        if (response.status !== 200) {
            console.log('❌ 下载失败,状态码:', response.status);
            return null;
        }

        const bytes = Buffer.from(await response.arrayBuffer());
        console.log('准备识别图片:', url, 'buffer长度:', bytes.length);

        // Only images recognised as WeChat QR codes are passed back for submission.
        const accepted = await isWeixinQRCode(bytes);
        if (accepted) {
            return bytes;
        }
        console.log('❌ 非微信2维码图片,跳过提交');
        return null;
    } catch (error) {
        console.log('❌ 下载出错:', error);
        return null;
    }
}

// Decide whether an image buffer contains a WeChat QR code.
// Pixels are decoded with sharp and scanned with @zxing/library; both are loaded with
// dynamic import() on each call.
/**
 * @param {Buffer} buffer - Encoded image bytes (any format sharp can read).
 * @returns {Promise<boolean>} true when a QR code is decoded and its lower-cased text
 *   contains "weixin", "wx" or "wechat"; false when a QR code with other content is
 *   decoded, when no variant decodes, or when image processing throws.
 *   A failure to import sharp / @zxing/library is NOT caught here and rejects.
 */
async function isWeixinQRCode(buffer) {
    console.log('开始识别2维码...');
    const sharpModule = await import('sharp');
    const sharp = sharpModule.default || sharpModule;
    const {
        MultiFormatReader,
        BarcodeFormat,
        RGBLuminanceSource,
        BinaryBitmap,
        HybridBinarizer,
        DecodeHintType,
    } = await import('@zxing/library');

    // Pre-processing variants tried in order: as-is, greyscale, inverted, greyscale + inverted.
    // Each one receives a fresh sharp pipeline, so no clone() is needed.
    const variants = [
        (img) => img,
        (img) => img.greyscale(),
        (img) => img.negate(),
        (img) => img.greyscale().negate(),
    ];

    // Restrict decoding to QR codes. The hints are loop-invariant, so build them once.
    const hints = new Map();
    hints.set(DecodeHintType.POSSIBLE_FORMATS, [BarcodeFormat.QR_CODE]);

    // NOTE(review): 'wx' is a very loose substring match and will accept unrelated
    // payloads that merely contain those two letters.
    const wechatMarkers = ['weixin', 'wx', 'wechat'];

    try {
        for (const applyVariant of variants) {
            // Read width/height/channels from the decoded output itself rather than
            // assuming a 4-channel layout with the input file's metadata dimensions.
            const { data, info } = await applyVariant(sharp(buffer))
                .ensureAlpha()
                .raw()
                .toBuffer({ resolveWithObject: true });
            const { width, height, channels } = info;
            const pixelCount = width * height;
            const hasColour = channels >= 3;

            const luminances = new Uint8ClampedArray(pixelCount);
            for (let i = 0; i < pixelCount; i++) {
                const offset = i * channels;
                // luma = R*0.299 + G*0.587 + B*0.114; for grey(+alpha) output the first
                // channel already is the luminance.
                luminances[i] = hasColour
                    ? 0.299 * data[offset] + 0.587 * data[offset + 1] + 0.114 * data[offset + 2]
                    : data[offset];
            }

            const source = new RGBLuminanceSource(luminances, width, height);
            const bitmap = new BinaryBitmap(new HybridBinarizer(source));
            const reader = new MultiFormatReader();
            try {
                // Pass the hints to decode() itself: calling decode(bitmap) without them
                // appears to reset hints set earlier via setHints() — TODO confirm against
                // the installed @zxing/library version.
                const result = reader.decode(bitmap, hints);
                const qrText = result.getText().toLowerCase();
                console.log('✅ 检测图片内容:', qrText);
                if (wechatMarkers.some((marker) => qrText.includes(marker))) {
                    console.log('✅ 检测到微信二维码:', qrText);
                    return true;
                }
                // A QR code was decoded but its content is not WeChat-related; the first
                // successful decode is final, remaining variants are not tried.
                console.log('❌ 检测到2维码,但不是微信2维码:', qrText);
                return false;
            } catch (err) {
                // Nothing decoded for this variant (best effort) — fall through to the next one.
            }
        }
        console.log('❌ ZXing 所有预处理均未识别2维码');
        return false;
    } catch (e) {
        console.log('❌ isWeixinQRCode 整体异常:', e);
        return false;
    }
}

/**
 * Append one record line to the info text file (path taken from `infoTxt`).
 *
 * The line has the shape `{title:<title>, expire:<expire>, url:<imageUrl>}` followed by
 * a newline. Values are inserted verbatim (no quoting), so the output is not JSON.
 * NOTE(review): `fs` is not defined in the visible part of this file — confirm it is
 * required elsewhere and that its appendFile returns a promise.
 *
 * @param {*} title - Value written after `title:`.
 * @param {*} expire - Value written after `expire:`.
 * @param {*} imageUrl - Value written after `url:`.
 * @returns {Promise<void>}
 */
async function saveInfo(title, expire, imageUrl) {
    // String.prototype.concat stringifies each argument the same way a template literal does.
    const record = '{title:'.concat(title, ', expire:', expire, ', url:', imageUrl, '}\n');
    await fs.appendFile(infoTxt, record, 'utf8');
}



// Entry point: read keywords.txt, then for every keyword open the Douyin search page in
// one shared browser tab, apply the "一天内" (within one day) filter, scroll for more
// results, and let the response listener submit covers that contain WeChat QR codes.
(async () => {
    // downloadImage() calls a bare `fetch`, so node-fetch is published on the global
    // object explicitly. A bare `fetch = ...` would be an implicit global assignment,
    // which throws in strict mode / ES modules when the runtime has no built-in fetch.
    globalThis.fetch = (await import('node-fetch')).default;

    // NOTE(review): `fs`, `path` and `puppeteer` are not defined in the visible part of
    // this file. `fs` is presumably fs-extra (ensureDir is called below) — confirm that
    // all three are required elsewhere.

    // Load the search keywords: one per line, surrounding whitespace and blank lines dropped.
    const keywordsPath = path.join(__dirname, 'keywords.txt');
    let keywords = [];
    try {
        const raw = await fs.readFile(keywordsPath, 'utf8');
        keywords = raw.split(/\r?\n/).map((line) => line.trim()).filter(Boolean);
    } catch (e) {
        console.log('未找到 keywords.txt 或读取失败:', e);
        return;
    }

    for (const kw of keywords) {
        // Percent-encode the (typically Chinese) keyword for use in the URL path.
        const encoded = encodeURIComponent(kw);
        const searchUrl = `https://www.douyin.com/search/${encoded}?aid=68376e3e-b0ff-489d-993a-b3f20675db5b&type=general`;
        console.log('开始处理关键词:', kw, searchUrl);
        try {
            await runDouyinSpiderWithUrl(searchUrl);
        } catch (e) {
            // A failure for one keyword (e.g. a navigation timeout) must not abort the rest.
            console.log('关键词处理失败,继续下一个:', kw, e);
        }
    }

    /**
     * Resolve after `ms` milliseconds.
     * Used instead of page.waitForTimeout, which is deprecated and no longer available
     * in recent Puppeteer releases.
     */
    function sleep(ms) {
        return new Promise((resolve) => setTimeout(resolve, ms));
    }

    /**
     * Response listener: for Douyin general-search API responses, check the cover image
     * of every result created within the last 24 hours and POST the ones that
     * downloadImage() accepts to the collection endpoint.
     */
    async function handleSearchResponse(response) {
        if (!response.url().startsWith('https://www.douyin.com/aweme/v1/web/general/search/single')) {
            return;
        }
        try {
            const json = JSON.parse(await response.text());
            if (!Array.isArray(json.data)) {
                return;
            }
            // create_time is compared against a Unix timestamp in seconds.
            const dayAgo = Math.floor(Date.now() / 1000) - 24 * 60 * 60;
            for (const item of json.data) {
                const aweme = item.aweme_info;
                if (!aweme) {
                    continue;
                }
                const desc = aweme.desc || '';
                const createTime = aweme.create_time || 0;
                if (createTime < dayAgo) {
                    // Older than 24 hours: skip.
                    continue;
                }
                const urlList = aweme.video?.cover?.url_list;
                const coverUrl = urlList && urlList.length > 0 ? urlList[0] : null;
                if (!coverUrl) {
                    continue;
                }
                const buffer = await downloadImage(coverUrl);
                if (buffer) {
                    const title = desc.slice(0, 30).replace(/\n|\r/g, ' ').trim() || '未知群名称';
                    // Only the image URL is submitted, not the image bytes.
                    const data = {
                        title,
                        qrcode: coverUrl,
                        images: coverUrl
                    };
                    try {
                        const resp = await fetch('https://h5.threeperson.com/xxxxx', {
                            method: 'POST',
                            headers: {
                                'Content-Type': 'application/json'
                            },
                            body: JSON.stringify(data)
                        });
                        const result = await resp.text();
                        console.log('已提交:', data, '返回:', result);
                    } catch (err) {
                        console.log('提交失败:', err);
                    }
                }
                // Pause between cover checks to keep the request rate low.
                await sleep(1000);
            }
        } catch (e) {
            // Best effort: an unreadable or non-JSON body must not crash the listener,
            // but the failure is logged instead of being silently dropped.
            console.log('搜索响应处理失败:', e);
        }
    }

    /**
     * Launch the browser, prepare its first tab and attach the response listener.
     * Also truncates the info file and ensures the output directory exists.
     * @returns {Promise<object>} The prepared Puppeteer page.
     */
    async function createSearchPage() {
        await fs.writeFile(infoTxt, '', 'utf8');
        await fs.ensureDir(saveDir);
        global.__douyinBrowser = await puppeteer.launch({
            headless: false,
            args: ['--start-maximized'],
            defaultViewport: null,
            executablePath: process.env.CHROME_PATH || 'C:/Program Files/Google/Chrome/Application/chrome.exe'
        });
        const [page] = await global.__douyinBrowser.pages();
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
        // Maximise again through CDP in case --start-maximized had no effect on this system.
        const session = await page.target().createCDPSession();
        const { windowId } = await session.send('Browser.getWindowForTarget');
        await session.send('Browser.setWindowBounds', { windowId, bounds: { windowState: 'maximized' } });
        await page.setRequestInterception(false);
        // Registered once; the same page (and listener) is reused for every keyword.
        page.on('response', handleSearchResponse);
        return page;
    }

    /**
     * Hover the first `div[tabindex="0"]` whose first span contains "筛选" (filter).
     * Element handles are looked up fresh on every call so stale handles are not reused.
     * @returns {Promise<boolean>} true when a hover succeeded.
     */
    async function hoverFilterButton(page) {
        const divs = await page.$$('div[tabindex="0"]');
        console.log('-----divs.length=', divs.length);
        for (const div of divs) {
            const span = await div.$('span');
            if (!span) {
                continue;
            }
            const text = await page.evaluate((el) => el.textContent, span);
            if (!text || !text.includes('筛选')) {
                continue;
            }
            try {
                await span.hover();
                await sleep(2000);
                return true;
            } catch (err) {
                console.log('筛选按钮点击失败,重试...', err);
            }
        }
        return false;
    }

    /**
     * Click the "一天内" (within one day) option, polling up to three times.
     * NOTE(review): 'span.eXMmo3JR' looks like a generated class name and may change
     * when the site is redeployed — confirm.
     * @returns {Promise<boolean>} true when the option was clicked.
     */
    async function clickOneDayFilter(page) {
        for (let attempt = 0; attempt < 3; attempt++) {
            await sleep(1000); // avoid polling too fast
            const timeElements = await page.$$('span.eXMmo3JR');
            for (const el of timeElements) {
                let text;
                try {
                    text = await page.evaluate((node) => node.textContent, el);
                } catch (err) {
                    // Handle went stale; skip this element.
                    continue;
                }
                if (!text || !text.includes('一天内')) {
                    continue;
                }
                try {
                    await el.click();
                    await sleep(2000);
                    return true;
                } catch (err) {
                    console.log('"一天内"按钮点击失败,重试...', err);
                }
            }
            console.log('未检测到"一天内"按钮,5秒后重试...');
            await sleep(5000);
        }
        return false;
    }

    /**
     * Scroll to the bottom up to `maxScrolls` times, pausing 3 s after each scroll, and
     * stop early once the page height is unchanged twice in a row.
     */
    async function scrollForMoreResults(page, maxScrolls = 5) {
        let lastHeight = await page.evaluate('document.body.scrollHeight');
        let noChangeCount = 0;
        for (let i = 0; i < maxScrolls; i++) {
            await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
            await sleep(3000);
            const newHeight = await page.evaluate('document.body.scrollHeight');
            if (newHeight === lastHeight) {
                noChangeCount++;
                if (noChangeCount >= 2) {
                    console.log('页面无新数据,提前停止滑动。');
                    break;
                }
            } else {
                noChangeCount = 0;
                lastHeight = newHeight;
            }
        }
    }

    /**
     * Open `searchUrl` in the shared page, apply the one-day filter and scroll.
     * The browser and page are created on the first call and reused afterwards; the
     * browser is intentionally left open when this function returns.
     * @param {string} searchUrl - Fully built Douyin search URL.
     */
    async function runDouyinSpiderWithUrl(searchUrl) {
        if (!global.__douyinPage) {
            global.__douyinPage = await createSearchPage();
        }
        // Switching keywords only navigates the existing page.
        const page = global.__douyinPage;
        await page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
        try {
            // Hover the filter button in two passes; each pass retries once when no
            // matching button was hovered.
            for (let i = 0; i < 2; i++) {
                await page.waitForSelector('div[tabindex="0"] span', { timeout: 10000 });
                const hovered = await hoverFilterButton(page);
                if (!hovered) {
                    console.log('未找到筛选按钮,重试...');
                    await hoverFilterButton(page);
                    await sleep(1000);
                }
            }
            if (await clickOneDayFilter(page)) {
                console.log('已自动筛选"一天内",开始自动滑动...');
                await scrollForMoreResults(page);
                console.log('滑动结束,页面已到底或无新内容。');
            } else {
                console.log('自动筛选失败,可手动筛选。');
            }
        } catch (e) {
            console.log('自动筛选失败,可手动筛选。', e);
        }
        // The browser is not closed; the next keyword reuses it.
    }
})();


评论