yandex 图片自动下载

2022-04-12 00:00:00 图片 默认 超时 关键词 下载

一个在 yandex 上搜索图片并下载到本地的 node cli 程序.

使用帮助:
$0 <搜索关键词> [-t=超时(默认 1000)] [-r 是否替换下载(默认 false)]

示例:
$0 shop # 搜索 shop 并下载
$0 shop -t=0 # 不使用超时
$0 shop -t=0 -r # 覆盖重名文件

一个群友让做的程序, ¥100. 需求就是使用 yandex 搜索图片, 并自动下载到本地.

分析

  • 打开 https://yandex.com/images/search?text=shop
  • 查看 images/search? 请求, p={当前页数}
  • 返回结果 blocks[0].params.lastPage 是大页数
  • 返回结果中所有 img_url={原图地址}

win 下简单的命令行交互

这里用批处理实现, 其实 node 中也可以做交互的.

C:.
│  yandex 图片自动下载.bat
│
└─lib
        node_modules
        index.js
        node.exe

yandex 图片自动下载.bat 文件内容

color f0 && chcp 936 && cls && @echo off
title yandex 图片自动下载
mode con cols=120 lines=30
:down
set /p keyword=输入关键词开始下载: 
.\lib\node.exe .\lib\index.js "%keyword%"
echo ==========================
echo 全部下载完成
goto :down

源码

index.js

const fs = require('fs')
const url = require('url')
const path = require('path')
const fly = require('flyio')
const text = process.argv[2]

if(text === undefined) {
  console.log(`
使用帮助:
$0 <搜索关键词> [-t=超时(默认 10000)] [-r 是否替换下载(默认 false)]

示例:
$0 shop # 搜索 shop 并下载
$0 shop -t=0 # 不使用超时
$0 shop -t=0 -r # 覆盖重名文件
`)
return
}

const argv = parseArgv()
const userConfig = {
  text, // 用户搜索的关键词
  timeout: argv['-t'] === undefined ? 10000 : Number(argv['-t']), // 设置超时时间, 默认 10 秒
  replace: argv['-r'] || false, // 是否替换下载, 默认否
}

const config = userConfig
fly.config.timeout = config.timeout
const downHistoryFile = `${process.cwd()}/download.txt`
fs.writeFileSync(downHistoryFile, '')
const downDir = `${process.cwd()}/download`

new Promise(async () => {
  let text = userConfig.text
  let {urls: lastUrls = [], lastPage = } = (await getImgUrls({text}).catch(err => console.log('err', err))) || {}
  if(lastPage === ) {
    console.log('没有搜索结果, 你可以更换关键词重试')
    return
  }
  for (let p = ; p < lastPage; p++) {
    let curPage = p + 1
    let urls = []
    if(curPage === 1) {
      urls = lastUrls
    } else {
      let getRes = await getImgUrls({text, p: curPage}).catch(err => console.log('err', err))
      urls = (getRes || {}).urls || [] // 获取页数
    }
    urls = [...new Set(urls)] // 去除重复的 url
    // urls = urls.slice(0, 3)
    const paseUrls = []
    urls.map(item => {
      const {pathname, href} = url.parse(item)
      let {base, ext, name} = path.parse(pathname);
      ext = ext || '.jpg' // 如果没有扩展名时, 默认给定为 jpg
      // let fullName = paseUrls.find(item => item.fullName === base)  // 如果保存文件名重复, 则添加时间戳处理
      //   ? `${name}_${uuid()}_${ext}`
      //   : base
      // let fullName = `${name}_${uuid()}_${ext}`
      let fullName = `${name || uuid()}${ext}`
      paseUrls.push({
        name,
        fullName,
        href,
      })
    })
    console.log(`正在下载第 ${curPage}/${lastPage}`)
    await Promise.all(paseUrls.map(item => download(item.href, item.fullName))).finally(async () => {
      console.log(`${curPage}/${lastPage} 页下载完成`)
    })
  }
})


function hasFile(filePath) {
  return fs.existsSync(filePath)
}

function parseArgv() {
  return process.argv.slice(2).reduce((acc, arg) => {
    let [k, v = true] = arg.split('=')
    acc[k] = v
    return acc
  }, {})
}

async function getImgUrls(arg = {}) {
  arg = {
    text: 'shop',
    p: 1,
    ...arg,
  }
  return new Promise((resolve, reject) => {
    console.log(`正在查找图片, 关键词: ${arg.text}, 页: ${arg.p}`)
    let api = `https://yandex.com/images/search?format=json&request=%7B%22blocks%22%3A%5B%7B%22block%22%3A%22serp-controller%22%2C%22params%22%3A%7B%7D%2C%22version%22%3A2%7D%2C%7B%22block%22%3A%22serp-list_infinite_yes%22%2C%22params%22%3A%7B%22initialPageNum%22%3A0%7D%2C%22version%22%3A2%7D%2C%7B%22block%22%3A%22more_direction_next%22%2C%22params%22%3A%7B%7D%2C%22version%22%3A2%7D%2C%7B%22block%22%3A%22gallery__items%3Aajax%22%2C%22params%22%3A%7B%7D%2C%22version%22%3A2%7D%5D%2C%22bmt%22%3A%7B%22lb%22%3A%22%22%7D%2C%22amt%22%3A%7B%22las%22%3A%22justifier-height%3D1%3Bthumb-underlay%3D1%3Bjustifier-setheight%3D1%3Bfitimages-height%3D1%3Bjustifier-fitincuts%3D1%3Bchunk.219.0%3D1%3Bchunk.169.0%3D1%22%7D%7D&yu=2650822041574437239&p=${arg.p}&text=${encodeURIComponent(arg.text)}&rpt=image&uinfo=sw-1920-sh-1200-ww-905-wh-1045-pd-2-wp-16x10_2560x1600`
    // console.log('api', api)
    fly.get(api)
      .then(res => {
        const data = res.data
        handleData(data, r => resolve(r))
      }).catch(err => {
        reject(err)
      })
  })
}

function execSync(cmd, out = false) {
  const child_process = require('child_process')
  let str = child_process.execSync(cmd).toString().trim()
  out && console.log(str)
  return str
}

async function handleData(data, cb) {
  try {
    const lastPage = data.blocks[].params.lastPage
    const urls = []
    data.blocks.forEach(item => {
      (item.html.match(/img_url=.*?&/g) || []).forEach(item => {
        urls.push(decodeURIComponent((item.match(/img_url=(.*?)&/) || [])[1] || ""))
      })
    })
    cb({
      lastPage,
      urls,
    })
  } catch (error) {
    console.log('error', error)
    cb({
      msg: '搜索结果出现错误',
      error,
      lastPage: ,
      urls: [],
    })
  }
}

async function download(u, p) {
  fs.writeFileSync(downHistoryFile, fs.readFileSync(downHistoryFile) + u + '\r\n')
  const savePath = `${downDir}/${p}`
  if(!config.replace && hasFile(savePath)) {
    console.log(`跳过已下载 ${p}`)
    return true
  } else {
    if(!hasFile(downDir)) {
      fs.mkdirSync(downDir)
    }
    return fly.download(u, savePath)
    .then(d => {
      console.log(`下载成功: ${p}`)
    })
    .catch(err => console.log('err', err.message))
  }
}
function uuid(sep = '') {
  let increment = process.increment === undefined ? (process.increment = 1) : (process.increment = (process.increment + 1))
  return `${Date.now()}_${increment}`.replace(/_/g, sep)
}

package.json

{
  "name": "getImg",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "keywords": [],
  "author": "xw",
  "dependencies": {
    "flyio": "^0.6.14",
    "request": "^2.88.0"
  },
  "license": "ISC"
}

注:
本程序有个小问题, 懒得解决, 看大家能不能发现.

来源 https://www.cnblogs.com/daysme/p/11919221.html

相关文章