cover-crawler/crawler.go

342 lines
7.7 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package main
import (
"bytes"
"errors"
"fmt"
"github.com/spf13/viper"
"golang.org/x/sync/errgroup"
"hash/crc32"
"image"
_ "image/jpeg"
"io"
"net/http"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"unicode"
)
// Crawler bundles the directories, configuration and HTTP client shared by
// the cover-crawling methods defined in this file.
type Crawler struct {
// avPath is the root directory that is walked when collecting video files.
avPath string
// outputPath is the directory into which downloaded cover images are written.
outputPath string
// config is the viper instance the methods query for the "crawler.*" settings.
config *viper.Viper
// client is the HTTP client used to download cover images.
client *http.Client
}
// coverCode identifies one cover by the two parts of a "LETTERS-NUMBER" name.
// It is comparable, so it can be used directly as a map key.
type coverCode struct {
// letters is the lower-cased segment that precedes the first "-".
letters string
// number is the value parsed by getCodeNum from the segment after the "-".
number int
}
// NewCrawler builds a Crawler for the given video directory (avPath) and cover
// output directory (outputPath).
//
// Settings are loaded from a YAML file named "config" looked up in ".". A
// failure to read it is only printed to standard output; the Crawler is still
// returned with whatever the viper instance holds at that point.
//
// The HTTP client has a 15 second overall timeout and a transport that keeps
// at most 10 idle connections for 30 seconds with compression disabled.
func NewCrawler(avPath, outputPath string) *Crawler {
	cfg := viper.New()
	cfg.SetConfigName("config")
	cfg.AddConfigPath(".")
	cfg.SetConfigType("yaml")
	if readErr := cfg.ReadInConfig(); readErr != nil {
		fmt.Printf("读取配置文件发生错误, %s", readErr)
	}

	transport := &http.Transport{
		MaxIdleConns:       10,
		IdleConnTimeout:    30 * time.Second,
		DisableCompression: true,
	}
	httpClient := &http.Client{
		Timeout:   15 * time.Second,
		Transport: transport,
	}

	crawler := &Crawler{
		avPath:     avPath,
		outputPath: outputPath,
		config:     cfg,
		client:     httpClient,
	}
	return crawler
}
// isVideoFile reports whether fileName's lower-cased extension exactly equals
// one of the entries of the "crawler.video" configuration list.
// The configured entries are compared as-is, so they must include the leading
// dot and be lower-case to match.
func (c *Crawler) isVideoFile(fileName string) bool {
	wanted := strings.ToLower(filepath.Ext(fileName))
	allowed := c.config.GetStringSlice("crawler.video")
	for i := range allowed {
		if allowed[i] == wanted {
			return true
		}
	}
	return false
}
// isJPGFile reports whether fileName ends in the ".jpg" extension, ignoring
// case. Other JPEG spellings such as ".jpeg" are not accepted.
func (c *Crawler) isJPGFile(fileName string) bool {
	const jpgExt = ".jpg"
	return strings.ToLower(filepath.Ext(fileName)) == jpgExt
}
// getFileInfo returns the CRC32 (IEEE) checksum of the leading bytes of the
// file at filePath.
//
// Only the first 4 KiB are hashed to keep the check cheap; files shorter than
// that (including empty files) are hashed in full. An error is returned when
// the file cannot be opened or read.
func (c *Crawler) getFileInfo(filePath string) (uint32, error) {
	file, err := os.Open(filePath)
	if err != nil {
		return 0, err
	}
	defer file.Close()

	buf := make([]byte, 4096)
	// io.ReadFull keeps reading until the buffer is full or the file ends. A
	// single Read call is allowed to return fewer bytes than requested, which
	// would make the checksum depend on how the read happened to be split.
	n, err := io.ReadFull(file, buf)
	// io.EOF (empty file) and io.ErrUnexpectedEOF (file shorter than the
	// buffer) simply mean there was less than 4 KiB to hash.
	if err != nil && !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
		return 0, err
	}
	return crc32.ChecksumIEEE(buf[:n]), nil
}
// getCodeNum parses the numeric part of a code from the start of s.
//
// Only the first three characters are considered: when all three are decimal
// digits their value is returned and anything after them is ignored. In every
// other case (fewer than three characters, a non-digit among them, or digits
// strconv.Atoi cannot parse) the result is 0.
func (c *Crawler) getCodeNum(s string) int {
	const digitCount = 3

	prefix := make([]rune, 0, digitCount)
	for _, r := range s {
		if len(prefix) == digitCount {
			break
		}
		if !unicode.IsDigit(r) {
			return 0
		}
		prefix = append(prefix, r)
	}
	if len(prefix) < digitCount {
		return 0
	}

	// unicode.IsDigit also accepts non-ASCII digits, which Atoi rejects;
	// those inputs yield 0.
	value, err := strconv.Atoi(string(prefix))
	if err != nil {
		return 0
	}
	return value
}
// getCoverCodeList converts file base names into cover codes.
//
// For each name:
//   - when it contains '@' anywhere but at position 0, only the text between
//     the first and the second '@' is kept (this drops a leading "domain@");
//   - the remainder is split on "-"; names with no "-" or whose first segment
//     is longer than 5 bytes are skipped;
//   - the second segment is parsed with getCodeNum and a result of 0 skips
//     the name.
//
// The returned slice is nil when nothing matched.
func (c *Crawler) getCoverCodeList(files []string) (coverList []coverCode) {
	for _, name := range files {
		if at := strings.IndexRune(name, '@'); at > 0 {
			name = strings.SplitN(name, "@", 3)[1]
		}

		parts := strings.Split(name, "-")
		if len(parts) < 2 {
			continue
		}
		prefix := parts[0]
		if len(prefix) > 5 {
			continue
		}

		if n := c.getCodeNum(parts[1]); n != 0 {
			coverList = append(coverList, coverCode{
				letters: strings.ToLower(prefix),
				number:  n,
			})
		}
	}
	return coverList
}
// getCoverImgUrl builds the download URL for code from the configured URL
// templates.
//
// When code.letters appears in the "crawler.code.prestige" list, the
// "crawler.url.prestige" template is used: every `*` is replaced by the
// letters and then every `#` by the number zero-padded to three digits.
//
// Otherwise the "crawler.url.other" template is used: every `*` is replaced by
// the letters followed by the number zero-padded to five digits, with a "1"
// prepended when the letters are longer than four bytes.
func (c *Crawler) getCoverImgUrl(code coverCode) string {
	prestigeCodes := c.config.GetStringSlice("crawler.code.prestige")
	for i := range prestigeCodes {
		if prestigeCodes[i] != code.letters {
			continue
		}
		template := c.config.GetString("crawler.url.prestige")
		withLetters := strings.ReplaceAll(template, `*`, code.letters)
		return strings.ReplaceAll(withLetters, `#`, fmt.Sprintf("%03d", code.number))
	}

	var codeStr string
	if len(code.letters) > 4 {
		codeStr = fmt.Sprintf("1%s%05d", code.letters, code.number)
	} else {
		codeStr = fmt.Sprintf("%s%05d", code.letters, code.number)
	}
	template := c.config.GetString("crawler.url.other")
	return strings.ReplaceAll(template, `*`, codeStr)
}
// fetchCoverImg downloads the cover image for code and stores it in the output
// directory as "<LETTERS>-<NNN><ext>", where <ext> is the extension of the
// image URL.
//
// It returns nil without writing anything when:
//   - code has fewer than two letters or a number below 1;
//   - the server answers with a status other than 200 OK;
//   - the image is lower than the "crawler.minHeight" setting.
//
// It returns a wrapped error when the request cannot be created or sent, the
// body cannot be read, the image header cannot be decoded, or the file cannot
// be written.
func (c *Crawler) fetchCoverImg(code coverCode) error {
	if len(code.letters) < 2 || code.number < 1 {
		return nil
	}

	imgUrl := c.getCoverImgUrl(code)
	suffix := filepath.Ext(imgUrl)
	fileName := fmt.Sprintf("%s-%03d%s",
		strings.ToUpper(code.letters),
		code.number,
		suffix,
	)
	filePath := filepath.Join(c.outputPath, fileName)

	req, err := http.NewRequest(http.MethodGet, imgUrl, nil)
	if err != nil {
		return fmt.Errorf("创建请求失败: %w", err)
	}

	// Present a browser-like User-Agent. The value must be sent as one
	// complete string: the previous code split it into three fragments and
	// sent a randomly chosen fragment, which is not a valid browser UA.
	const userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
		"AppleWebKit/537.36 (KHTML, like Gecko) " +
		"Chrome/120.0.0.0 Safari/537.36"
	req.Header.Set("User-Agent", userAgent)

	resp, err := c.client.Do(req)
	if err != nil {
		return fmt.Errorf("请求失败: %w", err)
	}
	defer resp.Body.Close()

	// A missing cover is not treated as a failure.
	if resp.StatusCode != http.StatusOK {
		// Best effort: drain the body so the connection can be reused; a
		// failure here does not change the outcome, so the error is ignored.
		_, _ = io.Copy(io.Discard, resp.Body)
		return nil
	}

	// Read the whole body first so the same bytes can be inspected and then
	// written to disk.
	imgData, err := io.ReadAll(resp.Body)
	if err != nil {
		return fmt.Errorf("读取封面数据失败(%s): %w", fileName, err)
	}

	// DecodeConfig only needs the image header to report its dimensions.
	img, _, err := image.DecodeConfig(bytes.NewReader(imgData))
	if err != nil {
		return fmt.Errorf("图片解码失败: %w", err)
	}

	// Discard images that do not reach the configured minimum height.
	if img.Height < c.config.GetInt("crawler.minHeight") {
		return nil
	}

	if err := os.WriteFile(filePath, imgData, 0644); err != nil {
		return fmt.Errorf("文件写入失败: %w", err)
	}
	return nil
}
// getAVPathVideoList walks the video directory (avPath) recursively and
// returns the base names, without extension, of the video files found.
//
// A file is reported once per distinct combination of base name and checksum
// (as returned by getFileInfo), so identical copies in different folders
// yield a single entry. Any error while visiting a path or hashing a file
// aborts the walk and is returned wrapped.
func (c *Crawler) getAVPathVideoList() (videoFiles []string, err error) {
	seen := make(map[string]struct{})

	visit := func(path string, info os.FileInfo, walkErr error) error {
		if walkErr != nil {
			return fmt.Errorf("访问路径 %s 失败: %w", path, walkErr)
		}

		name := info.Name()
		// Skip directories and anything that is not a video file.
		if info.IsDir() || !c.isVideoFile(name) {
			return nil
		}

		stem := strings.TrimSuffix(name, filepath.Ext(name))
		checksum, hashErr := c.getFileInfo(path)
		if hashErr != nil {
			return fmt.Errorf("获取文件哈希失败 %s: %w", stem, hashErr)
		}

		// Base name plus checksum is the identity used for de-duplication.
		key := fmt.Sprintf("%s-%d", stem, checksum)
		if _, duplicate := seen[key]; duplicate {
			return nil
		}
		seen[key] = struct{}{}
		videoFiles = append(videoFiles, stem)
		return nil
	}

	if walkErr := filepath.Walk(c.avPath, visit); walkErr != nil {
		return nil, fmt.Errorf("作品存放目录遍历失败: %w", walkErr)
	}
	return videoFiles, nil
}
// getOutPathCoverList walks the output directory (outputPath) recursively and
// returns the cover codes of the ".jpg" files already present there.
//
// Each distinct base name is considered once. It is split on "-": the first
// segment, lower-cased, becomes the letters and the second segment is parsed
// with getCodeNum. JPG files whose name contains no "-" cannot be cover files
// and are skipped. Any error while visiting a path aborts the walk and is
// returned wrapped.
func (c *Crawler) getOutPathCoverList() (coverList []coverCode, err error) {
	// Base names already handled, used for de-duplication.
	uniqueFiles := make(map[string]struct{})

	walkFn := func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return fmt.Errorf("访问路径 %s 失败: %w", path, err)
		}
		// Skip directories and anything that is not a JPG file.
		if info.IsDir() || !c.isJPGFile(info.Name()) {
			return nil
		}

		baseName := strings.TrimSuffix(info.Name(), filepath.Ext(info.Name()))
		if _, exists := uniqueFiles[baseName]; exists {
			return nil
		}
		uniqueFiles[baseName] = struct{}{}

		nameSlice := strings.Split(baseName, "-")
		// Guard against names such as "folder.jpg": indexing the second
		// segment without this check panics.
		if len(nameSlice) < 2 {
			return nil
		}
		coverList = append(coverList, coverCode{
			letters: strings.ToLower(nameSlice[0]),
			number:  c.getCodeNum(nameSlice[1]),
		})
		return nil
	}

	// Existing covers live in the output directory, not in the video directory.
	if err := filepath.Walk(c.outputPath, walkFn); err != nil {
		return nil, fmt.Errorf("输出目录遍历失败: %w", err)
	}
	return coverList, nil
}
// Handle runs one crawl: it lists the video files under the video directory,
// derives their cover codes, drops the codes whose cover already exists in the
// output directory, and downloads the remaining covers concurrently.
//
// It returns an error when either directory still holds the placeholder value
// "未选择", when one of the directory walks fails, or — after all downloads
// have finished — the first error reported by a download.
func (c *Crawler) Handle() error {
	if c.avPath == "未选择" || c.outputPath == "未选择" {
		return errors.New("请选择作品存放目录或输出目录")
	}

	videoFiles, err := c.getAVPathVideoList()
	if err != nil {
		return err
	}
	coverList := c.getCoverCodeList(videoFiles)

	existCovers, err := c.getOutPathCoverList()
	if err != nil {
		return err
	}

	// seen starts with the covers already on disk and grows with every code
	// queued below. This filters out existing covers and also collapses
	// duplicate codes (several video files can map to the same code), so two
	// goroutines never download and write the same file at once.
	seen := make(map[coverCode]struct{}, len(existCovers)+len(coverList))
	for _, existing := range existCovers {
		seen[existing] = struct{}{}
	}
	pending := make([]coverCode, 0, len(coverList))
	for _, item := range coverList {
		if _, dup := seen[item]; dup {
			continue
		}
		seen[item] = struct{}{}
		pending = append(pending, item)
	}

	// Bound the number of simultaneous downloads instead of starting one
	// goroutine (and one connection) per cover.
	const maxConcurrentFetches = 10

	var g errgroup.Group
	g.SetLimit(maxConcurrentFetches)
	for _, cover := range pending {
		cover := cover // per-iteration copy; required by toolchains before Go 1.22
		g.Go(func() error {
			return c.fetchCoverImg(cover)
		})
	}
	return g.Wait()
}