// Package main implements a crawler that scans a directory for video files,
// derives a cover code from each file name and downloads the matching cover
// image into an output directory.
package main

import (
	"bytes"
	"errors"
	"fmt"
	"hash/crc32"
	"image"
	_ "image/jpeg" // registers the JPEG decoder used by image.DecodeConfig
	"io"
	"net/http"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"
	"unicode"

	"github.com/spf13/viper"
	"golang.org/x/sync/errgroup"
)

// Crawler downloads cover images for the video files found under avPath and
// stores them in outputPath.
type Crawler struct {
	avPath     string        // directory that is scanned for video files
	outputPath string        // directory that receives the downloaded covers
	config     *viper.Viper  // settings read from config.yaml ("crawler.*" keys)
	client     *http.Client  // shared HTTP client used for every download
}

// coverCode is a parsed "<letters>-<number>" identifier. It is comparable and
// is used as a map key when filtering already downloaded covers.
type coverCode struct {
	letters string // lower-cased letter prefix
	number  int    // numeric part, parsed from the first three digits
}

// NewCrawler builds a Crawler for the given directories. It reads "config.yaml"
// from the current working directory; a read failure is only printed, so the
// returned Crawler then works with empty configuration values.
func NewCrawler(avPath, outputPath string) *Crawler {
	config := viper.New()
	config.SetConfigName("config")
	config.AddConfigPath(".")
	config.SetConfigType("yaml")
	if err := config.ReadInConfig(); err != nil {
		fmt.Printf("读取配置文件发生错误, %s", err)
	}

	return &Crawler{
		avPath:     avPath,
		outputPath: outputPath,
		config:     config,
		client: &http.Client{
			Timeout: 15 * time.Second,
			Transport: &http.Transport{
				MaxIdleConns:       10,
				IdleConnTimeout:    30 * time.Second,
				DisableCompression: true,
			},
		},
	}
}

// isVideoFile reports whether fileName has one of the extensions listed under
// the "crawler.video" configuration key. The file's extension is lower-cased
// before the comparison; the configured values are compared as-is.
func (c *Crawler) isVideoFile(fileName string) bool {
	videoExtensions := c.config.GetStringSlice("crawler.video")
	ext := strings.ToLower(filepath.Ext(fileName))
	for _, videoExt := range videoExtensions {
		if ext == videoExt {
			return true
		}
	}
	return false
}

// isJPGFile reports whether fileName has a ".jpg" extension (case-insensitive).
func (c *Crawler) isJPGFile(fileName string) bool {
	return strings.ToLower(filepath.Ext(fileName)) == ".jpg"
}

// getFileInfo returns the CRC32 (IEEE) checksum of the first 4 KiB of the file
// at filePath. Files shorter than 4 KiB are hashed in full.
func (c *Crawler) getFileInfo(filePath string) (uint32, error) {
	file, err := os.Open(filePath)
	if err != nil {
		return 0, err
	}
	defer file.Close()

	// Only the head of the file is hashed to keep the scan fast. io.ReadFull is
	// used because a single Read may legally return fewer bytes than requested,
	// which would make the checksum unstable.
	buf := make([]byte, 4096)
	n, err := io.ReadFull(file, buf)
	if err != nil && !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
		return 0, err
	}
	return crc32.ChecksumIEEE(buf[:n]), nil
}

// getCodeNum parses the first three characters of s as a decimal number.
// It returns 0 when s has fewer than three characters, when any of the first
// three characters is not a digit, or when the digits cannot be converted.
func (c *Crawler) getCodeNum(s string) int {
	runes := []rune(s)
	if len(runes) < 3 {
		return 0
	}
	for _, r := range runes[:3] {
		if !unicode.IsDigit(r) {
			return 0
		}
	}
	// unicode.IsDigit also accepts non-ASCII digits, which Atoi rejects; such
	// input is treated like any other invalid code.
	num, err := strconv.Atoi(string(runes[:3]))
	if err != nil {
		return 0
	}
	return num
}

// getCoverCodeList converts file base names into cover codes. A name is
// skipped when it has no "-" separator, when the part before the first "-" is
// longer than 5 bytes, or when the part after it does not start with a
// non-zero three-digit number.
func (c *Crawler) getCoverCodeList(files []string) (coverList []coverCode) {
	for _, file := range files {
		// Drop the "domain@" prefix some file names carry.
		if strings.IndexRune(file, '@') > 0 {
			file = strings.Split(file, "@")[1]
		}

		nameSlice := strings.Split(file, "-")
		if len(nameSlice) < 2 || len(nameSlice[0]) > 5 {
			continue
		}

		num := c.getCodeNum(nameSlice[1])
		if num == 0 {
			continue
		}

		coverList = append(coverList, coverCode{
			letters: strings.ToLower(nameSlice[0]),
			number:  num,
		})
	}
	return coverList
}

// getCoverImgUrl builds the download URL for code from the configured
// templates. Codes whose letters are listed under "crawler.code.prestige" use
// the "crawler.url.prestige" template ("*" -> letters, "#" -> zero-padded
// three-digit number); all others use "crawler.url.other" with "*" replaced by
// the letters followed by the five-digit number, prefixed with "1" when the
// letters are longer than 4 bytes.
func (c *Crawler) getCoverImgUrl(code coverCode) string {
	codePrestige := c.config.GetStringSlice("crawler.code.prestige")
	for _, pCode := range codePrestige {
		if code.letters == pCode {
			url := strings.ReplaceAll(c.config.GetString("crawler.url.prestige"), `*`, code.letters)
			url = strings.ReplaceAll(url, `#`, fmt.Sprintf("%03d", code.number))
			return url
		}
	}

	format := "%s%05d"
	if len(code.letters) > 4 {
		format = "1%s%05d"
	}
	codeStr := fmt.Sprintf(format, code.letters, code.number)
	return strings.ReplaceAll(c.config.GetString("crawler.url.other"), `*`, codeStr)
}

// fetchCoverImg downloads the cover for code and writes it to
// "<LETTERS>-<NNN><ext>" inside the output directory.
//
// It returns nil without writing anything when the code is invalid (fewer than
// two letters or a number below 1), when the server does not answer with
// 200 OK, or when the image is lower than the "crawler.minHeight" setting.
// Request, read, decode and write failures are returned as wrapped errors.
func (c *Crawler) fetchCoverImg(code coverCode) error {
	if len(code.letters) < 2 || code.number < 1 {
		return nil
	}

	imgUrl := c.getCoverImgUrl(code)
	suffix := filepath.Ext(imgUrl)
	fileName := fmt.Sprintf("%s-%03d%s",
		strings.ToUpper(code.letters),
		code.number,
		suffix,
	)
	filePath := filepath.Join(c.outputPath, fileName)

	req, err := http.NewRequest(http.MethodGet, imgUrl, nil)
	if err != nil {
		return fmt.Errorf("创建请求失败: %w", err)
	}

	// Present a browser-like User-Agent. The value must be sent as one string:
	// its three parts are components of a single header, not alternatives.
	const userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
		"AppleWebKit/537.36 (KHTML, like Gecko) " +
		"Chrome/120.0.0.0 Safari/537.36"
	req.Header.Set("User-Agent", userAgent)

	resp, err := c.client.Do(req)
	if err != nil {
		return fmt.Errorf("请求失败: %w", err)
	}
	defer resp.Body.Close()

	// A missing cover is not treated as an error.
	if resp.StatusCode != http.StatusOK {
		return nil
	}

	// Read the whole body first so the same bytes can be inspected and saved.
	imgData, err := io.ReadAll(resp.Body)
	if err != nil {
		return fmt.Errorf("读取封面数据失败(%s): %w", fileName, err)
	}

	// Decode only the image header to learn its dimensions.
	img, _, err := image.DecodeConfig(bytes.NewReader(imgData))
	if err != nil {
		return fmt.Errorf("图片解码失败: %w", err)
	}

	// Discard images that are lower than the configured minimum height.
	if img.Height < c.config.GetInt("crawler.minHeight") {
		return nil
	}

	if err := os.WriteFile(filePath, imgData, 0644); err != nil {
		return fmt.Errorf("文件写入失败: %w", err)
	}
	return nil
}

// getAVPathVideoList walks avPath recursively and returns the base names
// (without extension) of all video files. Files sharing both base name and
// head checksum are reported once.
func (c *Crawler) getAVPathVideoList() (videoFiles []string, err error) {
	seen := make(map[string]struct{})

	walkFn := func(path string, info os.FileInfo, walkErr error) error {
		if walkErr != nil {
			return fmt.Errorf("访问路径 %s 失败: %w", path, walkErr)
		}
		// Only regular entries that look like video files are of interest.
		if info.IsDir() || !c.isVideoFile(info.Name()) {
			return nil
		}

		baseName := strings.TrimSuffix(info.Name(), filepath.Ext(info.Name()))

		fileHash, hashErr := c.getFileInfo(path)
		if hashErr != nil {
			return fmt.Errorf("获取文件哈希失败 %s: %w", baseName, hashErr)
		}

		// Name plus checksum identifies a file for de-duplication.
		uniqueID := fmt.Sprintf("%s-%d", baseName, fileHash)
		if _, exists := seen[uniqueID]; !exists {
			seen[uniqueID] = struct{}{}
			videoFiles = append(videoFiles, baseName)
		}
		return nil
	}

	if err := filepath.Walk(c.avPath, walkFn); err != nil {
		return nil, fmt.Errorf("作品存放目录遍历失败: %w", err)
	}
	return videoFiles, nil
}

// getOutPathCoverList walks outputPath recursively and returns the cover codes
// of the ".jpg" files already present there, each code at most once. File
// names are parsed with the same rules as getCoverCodeList, so names that do
// not form a valid code are ignored.
func (c *Crawler) getOutPathCoverList() (coverList []coverCode, err error) {
	seen := make(map[coverCode]struct{})

	walkFn := func(path string, info os.FileInfo, walkErr error) error {
		if walkErr != nil {
			return fmt.Errorf("访问路径 %s 失败: %w", path, walkErr)
		}
		// Only JPG files can be existing covers.
		if info.IsDir() || !c.isJPGFile(info.Name()) {
			return nil
		}

		baseName := strings.TrimSuffix(info.Name(), filepath.Ext(info.Name()))
		// Reusing getCoverCodeList keeps both sides of the "already exists"
		// comparison consistent and avoids indexing past a name without "-".
		for _, code := range c.getCoverCodeList([]string{baseName}) {
			if _, exists := seen[code]; exists {
				continue
			}
			seen[code] = struct{}{}
			coverList = append(coverList, code)
		}
		return nil
	}

	// The existing covers live in the output directory, not in avPath.
	if err := filepath.Walk(c.outputPath, walkFn); err != nil {
		return nil, fmt.Errorf("输出目录遍历失败: %w", err)
	}
	return coverList, nil
}

// Handle runs the whole crawl: it collects the video files, derives their
// cover codes, drops the codes whose cover already exists in the output
// directory and downloads the remaining covers concurrently. It returns the
// first error reported by any step or download.
func (c *Crawler) Handle() error {
	// "未选择" is the placeholder value for a directory that was not chosen.
	if c.avPath == "未选择" || c.outputPath == "未选择" {
		return errors.New("请选择作品存放目录或输出目录")
	}

	videoFiles, err := c.getAVPathVideoList()
	if err != nil {
		return err
	}
	coverList := c.getCoverCodeList(videoFiles)

	existCovers, err := c.getOutPathCoverList()
	if err != nil {
		return err
	}

	// Filter out covers that are already present.
	if len(existCovers) > 0 {
		existMap := make(map[coverCode]struct{}, len(existCovers))
		for _, existing := range existCovers {
			existMap[existing] = struct{}{}
		}

		filtered := make([]coverCode, 0, len(coverList))
		for _, item := range coverList {
			if _, exists := existMap[item]; !exists {
				filtered = append(filtered, item)
			}
		}
		coverList = filtered
	}

	// NOTE(review): one goroutine is started per cover with no upper bound;
	// consider limiting concurrency if the lists get large.
	var g errgroup.Group
	for _, cover := range coverList {
		cover := cover // per-iteration copy; required for closures before Go 1.22
		g.Go(func() error {
			return c.fetchCoverImg(cover)
		})
	}
	return g.Wait()
}