2025-03-01 22:12:29 +08:00
|
|
|
|
package main
|
|
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
|
"bytes"
|
|
|
|
|
|
"errors"
|
|
|
|
|
|
"fmt"
|
|
|
|
|
|
"github.com/spf13/viper"
|
2025-03-02 21:12:10 +08:00
|
|
|
|
"golang.org/x/sync/errgroup"
|
2025-03-04 14:52:18 +08:00
|
|
|
|
"hash/crc32"
|
2025-03-01 22:12:29 +08:00
|
|
|
|
"image"
|
|
|
|
|
|
_ "image/jpeg"
|
|
|
|
|
|
"io"
|
|
|
|
|
|
"net/http"
|
|
|
|
|
|
"os"
|
|
|
|
|
|
"path/filepath"
|
|
|
|
|
|
"strconv"
|
|
|
|
|
|
"strings"
|
2025-03-04 16:53:51 +08:00
|
|
|
|
"time"
|
2025-03-01 22:12:29 +08:00
|
|
|
|
"unicode"
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
// Crawler scans a directory of video files, derives cover codes from their
// names, and downloads the matching cover images into an output directory.
// Configuration comes from a viper-loaded config file (see NewCrawler).
type Crawler struct {
	avPath     string       // directory scanned for video files
	outputPath string       // directory where downloaded cover images are written
	config     *viper.Viper // configuration loaded from ./config.yaml
	client     *http.Client // shared HTTP client with request/idle timeouts
}
|
|
|
|
|
|
|
2025-03-04 21:23:53 +08:00
|
|
|
|
// coverCode identifies a cover image by the two parts of a video code,
// e.g. "ABC-123" -> letters "abc", number 123.
type coverCode struct {
	letters string // lower-cased letter prefix of the code
	number  int    // numeric part parsed from the first three digits after '-'
}
|
|
|
|
|
|
|
2025-03-01 22:12:29 +08:00
|
|
|
|
func NewCrawler(avPath, outputPath string) *Crawler {
|
|
|
|
|
|
config := viper.New()
|
|
|
|
|
|
config.SetConfigName("config")
|
|
|
|
|
|
config.AddConfigPath(".")
|
|
|
|
|
|
config.SetConfigType("yaml")
|
|
|
|
|
|
|
|
|
|
|
|
if err := config.ReadInConfig(); err != nil {
|
|
|
|
|
|
fmt.Printf("读取配置文件发生错误, %s", err)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return &Crawler{
|
|
|
|
|
|
avPath: avPath,
|
|
|
|
|
|
outputPath: outputPath,
|
|
|
|
|
|
config: config,
|
2025-03-04 16:53:51 +08:00
|
|
|
|
client: &http.Client{
|
|
|
|
|
|
Timeout: 15 * time.Second,
|
|
|
|
|
|
Transport: &http.Transport{
|
|
|
|
|
|
MaxIdleConns: 10,
|
|
|
|
|
|
IdleConnTimeout: 30 * time.Second,
|
|
|
|
|
|
DisableCompression: true,
|
|
|
|
|
|
},
|
|
|
|
|
|
},
|
2025-03-01 22:12:29 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 检查文件是否是视频文件
|
|
|
|
|
|
func (c *Crawler) isVideoFile(fileName string) bool {
|
|
|
|
|
|
videoExtensions := c.config.GetStringSlice("crawler.video")
|
|
|
|
|
|
ext := strings.ToLower(filepath.Ext(fileName))
|
|
|
|
|
|
for _, videoExt := range videoExtensions {
|
|
|
|
|
|
if ext == videoExt {
|
|
|
|
|
|
return true
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
return false
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-03-07 19:33:12 +08:00
|
|
|
|
// 检查文件是否是 JPG 文件
|
|
|
|
|
|
func (c *Crawler) isJPGFile(fileName string) bool {
|
|
|
|
|
|
ext := strings.ToLower(filepath.Ext(fileName))
|
|
|
|
|
|
return ext == ".jpg"
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-03-04 14:52:18 +08:00
|
|
|
|
// 获取文件 CRC32 哈希值
|
|
|
|
|
|
func (c *Crawler) getFileInfo(filePath string) (uint32, error) {
|
|
|
|
|
|
file, err := os.Open(filePath)
|
2025-03-01 22:12:29 +08:00
|
|
|
|
if err != nil {
|
2025-03-04 14:52:18 +08:00
|
|
|
|
return 0, err
|
2025-03-01 22:12:29 +08:00
|
|
|
|
}
|
2025-03-04 14:52:18 +08:00
|
|
|
|
defer file.Close()
|
|
|
|
|
|
|
|
|
|
|
|
// 使用快速 CRC32 校验,仅读取前 4KB 内容
|
|
|
|
|
|
buf := make([]byte, 4096)
|
|
|
|
|
|
n, err := file.Read(buf)
|
|
|
|
|
|
if err != nil && err != io.EOF {
|
|
|
|
|
|
return 0, err
|
|
|
|
|
|
}
|
|
|
|
|
|
return crc32.ChecksumIEEE(buf[:n]), nil
|
2025-03-01 22:12:29 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 获取代码数字
|
|
|
|
|
|
func (c *Crawler) getCodeNum(s string) int {
|
|
|
|
|
|
runes := []rune(s)
|
|
|
|
|
|
if len(runes) < 3 {
|
|
|
|
|
|
return 0
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
for i := 0; i < 3; i++ {
|
|
|
|
|
|
if !unicode.IsDigit(runes[i]) {
|
|
|
|
|
|
return 0
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
num, _ := strconv.Atoi(string(runes[:3]))
|
|
|
|
|
|
return num
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 获取封面代码列表
|
2025-03-04 21:23:53 +08:00
|
|
|
|
func (c *Crawler) getCoverCodeList(files []string) (coverList []coverCode) {
|
2025-03-01 22:12:29 +08:00
|
|
|
|
for _, file := range files {
|
2025-03-02 07:52:53 +08:00
|
|
|
|
// 去除域名部分
|
|
|
|
|
|
if strings.IndexRune(file, '@') > 0 {
|
|
|
|
|
|
file = strings.Split(file, "@")[1]
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-03-01 22:12:29 +08:00
|
|
|
|
nameSlice := strings.Split(file, "-")
|
|
|
|
|
|
if len(nameSlice) < 2 || len(nameSlice[0]) > 5 {
|
|
|
|
|
|
continue
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
num := c.getCodeNum(nameSlice[1])
|
|
|
|
|
|
if num == 0 {
|
|
|
|
|
|
continue
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-03-04 21:23:53 +08:00
|
|
|
|
coverList = append(coverList, coverCode{
|
|
|
|
|
|
letters: strings.ToLower(nameSlice[0]),
|
|
|
|
|
|
number: num,
|
|
|
|
|
|
})
|
2025-03-01 22:12:29 +08:00
|
|
|
|
}
|
2025-03-04 14:17:30 +08:00
|
|
|
|
return coverList
|
2025-03-01 22:12:29 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-03-07 16:26:34 +08:00
|
|
|
|
// 组装封面图片地址
|
|
|
|
|
|
func (c *Crawler) getCoverImgUrl(code coverCode) string {
|
|
|
|
|
|
codePrestige := c.config.GetStringSlice("crawler.code.prestige")
|
|
|
|
|
|
for _, pCode := range codePrestige {
|
|
|
|
|
|
if code.letters == pCode {
|
|
|
|
|
|
url := strings.ReplaceAll(c.config.GetString("crawler.url.prestige"), `*`, code.letters)
|
|
|
|
|
|
url = strings.ReplaceAll(url, `#`, fmt.Sprintf("%03d", code.number))
|
|
|
|
|
|
return url
|
|
|
|
|
|
}
|
2025-03-03 17:00:53 +08:00
|
|
|
|
}
|
2025-03-04 17:10:08 +08:00
|
|
|
|
|
2025-03-04 21:23:53 +08:00
|
|
|
|
format := "%s%05d"
|
|
|
|
|
|
if len(code.letters) > 4 {
|
|
|
|
|
|
format = "1%s%05d"
|
2025-03-03 17:00:53 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-03-04 21:23:53 +08:00
|
|
|
|
codeStr := fmt.Sprintf(format, code.letters, code.number)
|
2025-03-07 16:26:34 +08:00
|
|
|
|
return strings.ReplaceAll(c.config.GetString("crawler.url.other"), `*`, codeStr)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 获取封面图片
|
|
|
|
|
|
func (c *Crawler) fetchCoverImg(code coverCode) error {
|
|
|
|
|
|
if len(code.letters) < 2 || code.number < 1 {
|
|
|
|
|
|
return nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
imgUrl := c.getCoverImgUrl(code)
|
2025-03-04 21:23:53 +08:00
|
|
|
|
suffix := filepath.Ext(imgUrl)
|
2025-03-07 16:26:34 +08:00
|
|
|
|
fileName := fmt.Sprintf("%s-%03d%s",
|
2025-03-04 21:23:53 +08:00
|
|
|
|
strings.ToUpper(code.letters),
|
|
|
|
|
|
code.number,
|
2025-03-01 22:12:29 +08:00
|
|
|
|
suffix,
|
2025-03-07 16:26:34 +08:00
|
|
|
|
)
|
|
|
|
|
|
filePath := filepath.Join(c.outputPath, fileName)
|
2025-03-01 22:12:29 +08:00
|
|
|
|
|
2025-03-07 19:33:12 +08:00
|
|
|
|
req, err := http.NewRequest(http.MethodGet, imgUrl, nil)
|
2025-03-01 22:12:29 +08:00
|
|
|
|
if err != nil {
|
2025-03-03 19:38:42 +08:00
|
|
|
|
return fmt.Errorf("创建请求失败: %w", err)
|
2025-03-01 22:12:29 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 模拟浏览器请求
|
2025-03-04 16:53:51 +08:00
|
|
|
|
userAgents := []string{
|
|
|
|
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
|
|
|
|
|
|
"AppleWebKit/537.36 (KHTML, like Gecko)",
|
|
|
|
|
|
"Chrome/120.0.0.0 Safari/537.36",
|
|
|
|
|
|
}
|
|
|
|
|
|
req.Header.Set("User-Agent", userAgents[time.Now().UnixNano()%int64(len(userAgents))])
|
2025-03-01 22:12:29 +08:00
|
|
|
|
|
2025-03-04 16:53:51 +08:00
|
|
|
|
resp, err := c.client.Do(req)
|
2025-03-01 22:12:29 +08:00
|
|
|
|
if err != nil {
|
2025-03-03 19:38:42 +08:00
|
|
|
|
return fmt.Errorf("请求失败: %w", err)
|
2025-03-01 22:12:29 +08:00
|
|
|
|
}
|
|
|
|
|
|
defer resp.Body.Close()
|
|
|
|
|
|
|
|
|
|
|
|
// 检查 HTTP 状态码
|
|
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
|
|
|
|
return nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-03-03 19:38:42 +08:00
|
|
|
|
// 先完整读取 HTTP Body
|
2025-03-01 22:12:29 +08:00
|
|
|
|
imgData, err := io.ReadAll(resp.Body)
|
|
|
|
|
|
if err != nil {
|
2025-03-07 16:26:34 +08:00
|
|
|
|
return fmt.Errorf("读取封面数据失败(%s): %w", fileName, err)
|
2025-03-01 22:12:29 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-03-03 19:38:42 +08:00
|
|
|
|
// 使用内存数据解码图片
|
|
|
|
|
|
img, _, err := image.DecodeConfig(bytes.NewReader(imgData))
|
2025-03-01 22:12:29 +08:00
|
|
|
|
if err != nil {
|
2025-03-03 19:38:42 +08:00
|
|
|
|
return fmt.Errorf("图片解码失败: %w", err)
|
2025-03-01 22:12:29 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 图片高度未到达配置最低值则抛弃
|
|
|
|
|
|
if img.Height < c.config.GetInt("crawler.minHeight") {
|
|
|
|
|
|
return nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-03-03 19:38:42 +08:00
|
|
|
|
// 将内存数据写入文件
|
2025-03-07 16:26:34 +08:00
|
|
|
|
if err := os.WriteFile(filePath, imgData, 0644); err != nil {
|
2025-03-03 19:38:42 +08:00
|
|
|
|
return fmt.Errorf("文件写入失败: %w", err)
|
2025-03-01 22:12:29 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-03-07 19:33:12 +08:00
|
|
|
|
// 获取作品存放目录视频文件列表
|
|
|
|
|
|
func (c *Crawler) getAVPathVideoList() (videoFiles []string, err error) {
|
|
|
|
|
|
// 用于去重
|
2025-03-01 22:12:29 +08:00
|
|
|
|
uniqueFiles := make(map[string]struct{})
|
|
|
|
|
|
if err := filepath.Walk(c.avPath, func(path string, info os.FileInfo, err error) error {
|
|
|
|
|
|
if err != nil {
|
2025-03-04 14:17:30 +08:00
|
|
|
|
return fmt.Errorf("访问路径 %s 失败: %w", path, err)
|
2025-03-01 22:12:29 +08:00
|
|
|
|
}
|
2025-03-04 14:17:30 +08:00
|
|
|
|
|
2025-03-04 14:52:18 +08:00
|
|
|
|
// 目录过滤
|
|
|
|
|
|
if info.IsDir() {
|
|
|
|
|
|
return nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2025-03-01 22:12:29 +08:00
|
|
|
|
// 仅处理视频文件
|
2025-03-04 14:17:30 +08:00
|
|
|
|
if !c.isVideoFile(info.Name()) {
|
|
|
|
|
|
return nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
baseName := strings.TrimSuffix(info.Name(), filepath.Ext(info.Name()))
|
|
|
|
|
|
|
2025-03-04 14:52:18 +08:00
|
|
|
|
// 获取文件哈希
|
|
|
|
|
|
fileHash, err := c.getFileInfo(path)
|
2025-03-04 14:17:30 +08:00
|
|
|
|
if err != nil {
|
2025-03-04 14:52:18 +08:00
|
|
|
|
return fmt.Errorf("获取文件哈希失败 %s: %w", baseName, err)
|
2025-03-04 14:17:30 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-03-04 14:52:18 +08:00
|
|
|
|
// 使用文件名+哈希值作为唯一标识
|
|
|
|
|
|
uniqueID := fmt.Sprintf("%s-%d", baseName, fileHash)
|
2025-03-04 14:17:30 +08:00
|
|
|
|
if _, exists := uniqueFiles[uniqueID]; !exists {
|
|
|
|
|
|
uniqueFiles[uniqueID] = struct{}{}
|
|
|
|
|
|
videoFiles = append(videoFiles, baseName)
|
2025-03-01 22:12:29 +08:00
|
|
|
|
}
|
|
|
|
|
|
return nil
|
|
|
|
|
|
}); err != nil {
|
2025-03-07 19:33:12 +08:00
|
|
|
|
return nil, fmt.Errorf("作品存放目录遍历失败: %w", err)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return videoFiles, nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 获取输出目录已存在的封面列表
|
|
|
|
|
|
func (c *Crawler) getOutPathCoverList() (coverList []coverCode, err error) {
|
|
|
|
|
|
// 用于去重
|
|
|
|
|
|
uniqueFiles := make(map[string]struct{})
|
|
|
|
|
|
if err := filepath.Walk(c.avPath, func(path string, info os.FileInfo, err error) error {
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return fmt.Errorf("访问路径 %s 失败: %w", path, err)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 目录过滤
|
|
|
|
|
|
if info.IsDir() {
|
|
|
|
|
|
return nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 仅处理图片文件
|
|
|
|
|
|
if !c.isJPGFile(info.Name()) {
|
|
|
|
|
|
return nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
baseName := strings.TrimSuffix(info.Name(), filepath.Ext(info.Name()))
|
|
|
|
|
|
if _, exists := uniqueFiles[baseName]; !exists {
|
|
|
|
|
|
uniqueFiles[baseName] = struct{}{}
|
|
|
|
|
|
nameSlice := strings.Split(baseName, "-")
|
|
|
|
|
|
coverList = append(coverList, coverCode{
|
|
|
|
|
|
letters: strings.ToLower(nameSlice[0]),
|
|
|
|
|
|
number: c.getCodeNum(nameSlice[1]),
|
|
|
|
|
|
})
|
|
|
|
|
|
}
|
|
|
|
|
|
return nil
|
|
|
|
|
|
}); err != nil {
|
|
|
|
|
|
return nil, fmt.Errorf("输出目录遍历失败: %w", err)
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return coverList, nil
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
func (c *Crawler) Handle() error {
|
|
|
|
|
|
if c.avPath == "未选择" || c.outputPath == "未选择" {
|
|
|
|
|
|
return errors.New("请选择作品存放目录或输出目录")
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
videoFiles, err := c.getAVPathVideoList()
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return err
|
2025-03-01 22:12:29 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-03-04 14:17:30 +08:00
|
|
|
|
coverList := c.getCoverCodeList(videoFiles)
|
2025-03-07 19:33:12 +08:00
|
|
|
|
existCovers, err := c.getOutPathCoverList()
|
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
return err
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 过滤已存在的封面
|
|
|
|
|
|
if len(existCovers) > 0 {
|
|
|
|
|
|
// 创建哈希表用于快速查找
|
|
|
|
|
|
existMap := make(map[coverCode]struct{})
|
|
|
|
|
|
for _, c := range existCovers {
|
|
|
|
|
|
existMap[c] = struct{}{}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// 创建新切片过滤已存在项
|
|
|
|
|
|
filtered := make([]coverCode, 0, len(coverList))
|
|
|
|
|
|
for _, item := range coverList {
|
|
|
|
|
|
if _, exists := existMap[item]; !exists {
|
|
|
|
|
|
filtered = append(filtered, item)
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
coverList = filtered
|
|
|
|
|
|
}
|
2025-03-01 22:12:29 +08:00
|
|
|
|
|
2025-03-02 21:19:10 +08:00
|
|
|
|
var g errgroup.Group
|
2025-03-01 22:12:29 +08:00
|
|
|
|
for _, cover := range coverList {
|
2025-03-02 21:19:10 +08:00
|
|
|
|
g.Go(func() error {
|
2025-03-07 19:33:12 +08:00
|
|
|
|
return c.fetchCoverImg(cover)
|
2025-03-02 21:12:10 +08:00
|
|
|
|
})
|
2025-03-01 22:12:29 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-03-02 21:19:10 +08:00
|
|
|
|
return g.Wait()
|
2025-03-01 22:12:29 +08:00
|
|
|
|
}
|