package main import ( "bytes" "context" "errors" "fmt" "github.com/spf13/viper" "golang.org/x/sync/errgroup" "hash/crc32" "image" _ "image/jpeg" "io" "net/http" "os" "path/filepath" "strconv" "strings" "time" "unicode" ) type Crawler struct { avPath string outputPath string config *viper.Viper client *http.Client } type coverCode struct { letters string number int } func NewCrawler(avPath, outputPath string) *Crawler { config := viper.New() config.SetConfigName("config") config.AddConfigPath(".") config.SetConfigType("yaml") if err := config.ReadInConfig(); err != nil { fmt.Printf("读取配置文件发生错误, %s", err) } return &Crawler{ avPath: avPath, outputPath: outputPath, config: config, client: &http.Client{ Timeout: 15 * time.Second, Transport: &http.Transport{ MaxIdleConns: 10, IdleConnTimeout: 30 * time.Second, DisableCompression: true, }, }, } } // 检查文件是否是视频文件 func (c *Crawler) isVideoFile(fileName string) bool { videoExtensions := c.config.GetStringSlice("crawler.video") ext := strings.ToLower(filepath.Ext(fileName)) for _, videoExt := range videoExtensions { if ext == videoExt { return true } } return false } // 获取文件 CRC32 哈希值 func (c *Crawler) getFileInfo(filePath string) (uint32, error) { file, err := os.Open(filePath) if err != nil { return 0, err } defer file.Close() // 使用快速 CRC32 校验,仅读取前 4KB 内容 buf := make([]byte, 4096) n, err := file.Read(buf) if err != nil && err != io.EOF { return 0, err } return crc32.ChecksumIEEE(buf[:n]), nil } // 获取代码数字 func (c *Crawler) getCodeNum(s string) int { runes := []rune(s) if len(runes) < 3 { return 0 } for i := 0; i < 3; i++ { if !unicode.IsDigit(runes[i]) { return 0 } } num, _ := strconv.Atoi(string(runes[:3])) return num } // 获取封面代码列表 func (c *Crawler) getCoverCodeList(files []string) (coverList []coverCode) { for _, file := range files { // 去除域名部分 if strings.IndexRune(file, '@') > 0 { file = strings.Split(file, "@")[1] } nameSlice := strings.Split(file, "-") if len(nameSlice) < 2 || len(nameSlice[0]) > 5 { continue } num := c.getCodeNum(nameSlice[1]) if num == 0 { continue } coverList = append(coverList, coverCode{ letters: strings.ToLower(nameSlice[0]), number: num, }) } return coverList } // 组装封面图片地址 func (c *Crawler) getCoverImgUrl(code coverCode) string { codePrestige := c.config.GetStringSlice("crawler.code.prestige") for _, pCode := range codePrestige { if code.letters == pCode { url := strings.ReplaceAll(c.config.GetString("crawler.url.prestige"), `*`, code.letters) url = strings.ReplaceAll(url, `#`, fmt.Sprintf("%03d", code.number)) return url } } format := "%s%05d" if len(code.letters) > 4 { format = "1%s%05d" } codeStr := fmt.Sprintf(format, code.letters, code.number) return strings.ReplaceAll(c.config.GetString("crawler.url.other"), `*`, codeStr) } // 获取封面图片 func (c *Crawler) fetchCoverImg(code coverCode) error { if len(code.letters) < 2 || code.number < 1 { return nil } imgUrl := c.getCoverImgUrl(code) suffix := filepath.Ext(imgUrl) fileName := fmt.Sprintf("%s-%03d%s", strings.ToUpper(code.letters), code.number, suffix, ) filePath := filepath.Join(c.outputPath, fileName) // 使用带超时的上下文 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() req, err := http.NewRequestWithContext(ctx, "GET", imgUrl, nil) if err != nil { return fmt.Errorf("创建请求失败: %w", err) } // 模拟浏览器请求 userAgents := []string{ "Mozilla/5.0 (Windows NT 10.0; Win64; x64)", "AppleWebKit/537.36 (KHTML, like Gecko)", "Chrome/120.0.0.0 Safari/537.36", } req.Header.Set("User-Agent", userAgents[time.Now().UnixNano()%int64(len(userAgents))]) resp, err := c.client.Do(req) if err != nil { return fmt.Errorf("请求失败: %w", err) } defer resp.Body.Close() // 检查 HTTP 状态码 if resp.StatusCode != http.StatusOK { return nil } // 先完整读取 HTTP Body imgData, err := io.ReadAll(resp.Body) if err != nil { return fmt.Errorf("读取封面数据失败(%s): %w", fileName, err) } // 使用内存数据解码图片 img, _, err := image.DecodeConfig(bytes.NewReader(imgData)) if err != nil { return fmt.Errorf("图片解码失败: %w", err) } // 图片高度未到达配置最低值则抛弃 if img.Height < c.config.GetInt("crawler.minHeight") { return nil } // 将内存数据写入文件 if err := os.WriteFile(filePath, imgData, 0644); err != nil { return fmt.Errorf("文件写入失败: %w", err) } return nil } func (c *Crawler) Handle() error { if c.avPath == "未选择" || c.outputPath == "未选择" { return errors.New("请选择作品存放目录或输出目录") } // 用于去重的集合 uniqueFiles := make(map[string]struct{}) var videoFiles []string if err := filepath.Walk(c.avPath, func(path string, info os.FileInfo, err error) error { if err != nil { return fmt.Errorf("访问路径 %s 失败: %w", path, err) } // 目录过滤 if info.IsDir() { return nil } // 仅处理视频文件 if !c.isVideoFile(info.Name()) { return nil } baseName := strings.TrimSuffix(info.Name(), filepath.Ext(info.Name())) // 获取文件哈希 fileHash, err := c.getFileInfo(path) if err != nil { return fmt.Errorf("获取文件哈希失败 %s: %w", baseName, err) } // 使用文件名+哈希值作为唯一标识 uniqueID := fmt.Sprintf("%s-%d", baseName, fileHash) if _, exists := uniqueFiles[uniqueID]; !exists { uniqueFiles[uniqueID] = struct{}{} videoFiles = append(videoFiles, baseName) } return nil }); err != nil { return fmt.Errorf("目录遍历失败: %w", err) } coverList := c.getCoverCodeList(videoFiles) var g errgroup.Group for _, cover := range coverList { g.Go(func() error { if err := c.fetchCoverImg(cover); err != nil { return err } return nil }) } return g.Wait() }