# abot/utils/media_downloader.py

import asyncio
import os
import aiohttp
import aiofiles
import uuid
from typing import Optional
from urllib.parse import urlparse
from loguru import logger
import time

class MediaDownloader:
    """Download remote media files (images, etc.) into a local directory.

    Every successful download also triggers a sweep that deletes files in the
    download directory older than the retention period of ``clear_downloads``.
    """

    # URL path suffixes that identify a file type without any network access.
    _SUFFIX_EXTENSIONS = {
        '.jpg': 'jpg',
        '.jpeg': 'jpg',
        '.png': 'png',
        '.gif': 'gif',
        '.mp4': 'mp4',
        '.mp3': 'mp3',
        '.pdf': 'pdf',
    }

    # Content-Type fragments mapped to the extension used for the saved file.
    _CONTENT_TYPE_EXTENSIONS = {
        'image/jpeg': 'jpg',
        'image/png': 'png',
        'image/gif': 'gif',
        'video/mp4': 'mp4',
        'audio/mpeg': 'mp3',
        'application/pdf': 'pdf',
    }

    def __init__(self, download_dir: Optional[str] = None):
        """
        Initialise the downloader and make sure the target directory exists.

        Args:
            download_dir: Directory to store downloads in. Defaults to a
                ``media_downloads`` directory located one level above the
                directory containing this module.
        """
        self.LOG = logger
        self.download_dir = download_dir or os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            "media_downloads"
        )
        os.makedirs(self.download_dir, exist_ok=True)
        self.LOG.info(f"媒体下载目录: {self.download_dir}")

    async def download_media(self, url: str, file_type: Optional[str] = None) -> Optional[str]:
        """
        Download a media file into the download directory.

        The file name is taken from the last segment of the URL path. When
        that segment is empty, has no extension, or is a dot-only name, a
        random name is generated instead, using ``file_type`` (or a guessed
        type) as the extension.

        Args:
            url: URL of the media file.
            file_type: Extension to use for generated file names (e.g. 'jpg',
                'png'); a leading dot is tolerated. Guessed from the URL when
                omitted.

        Returns:
            Absolute local path of the downloaded file, or None on failure.
        """
        # Path of the in-progress file; reset to None once it has been renamed.
        temp_path = None
        try:
            parsed_url = urlparse(url)
            filename = os.path.basename(parsed_url.path)

            # "." and ".." contain a dot but name directories, not files, so
            # they get a generated name just like extension-less names.
            if not filename.strip('.') or '.' not in filename:
                ext = file_type if file_type else await self._guess_file_type(url)
                ext = ext.lstrip('.') if ext else ext
                filename = f"{uuid.uuid4().hex}.{ext}" if ext else f"{uuid.uuid4().hex}"

            local_path = os.path.join(self.download_dir, filename)

            self.LOG.info(f"开始下载媒体文件: {url} -> {local_path}")

            # Stream into a uniquely named sibling file so that a failed or
            # cancelled transfer never leaves a truncated file at local_path.
            temp_path = f"{local_path}.{uuid.uuid4().hex}.part"
            timeout = aiohttp.ClientTimeout(total=30)
            async with aiohttp.ClientSession() as session:
                async with session.get(url, timeout=timeout) as response:
                    response.raise_for_status()

                    async with aiofiles.open(temp_path, 'wb') as f:
                        async for chunk in response.content.iter_chunked(8192):
                            if chunk:
                                await f.write(chunk)

            # Same-directory rename: the complete file appears in one step.
            await asyncio.to_thread(os.replace, temp_path, local_path)
            temp_path = None

            self.LOG.info(f"媒体文件下载成功: {local_path}")

            # Sweep expired files after each successful download.
            await self.clear_downloads()

            return os.path.abspath(local_path)

        except Exception as e:
            self.LOG.error(f"下载媒体文件失败: {url}, 错误: {str(e)}")
            return None
        finally:
            # Runs on failure and on cancellation alike; best effort only,
            # since the partial file may never have been created.
            if temp_path is not None:
                try:
                    os.remove(temp_path)
                except OSError:
                    pass

    async def _guess_file_type(self, url: str) -> Optional[str]:
        """
        Guess the file extension for a URL.

        The URL path suffix is checked first; if it is not recognised, a HEAD
        request is issued and the Content-Type header is inspected.

        Args:
            url: URL of the media file.

        Returns:
            File extension without the leading dot, or None if it cannot be
            determined (including when the HEAD request fails).
        """
        try:
            path = urlparse(url).path.lower()
            for suffix, ext in self._SUFFIX_EXTENSIONS.items():
                if path.endswith(suffix):
                    return ext

            # Fall back to the Content-Type header. Redirects are followed so
            # the header describes the resource the later GET will fetch.
            timeout = aiohttp.ClientTimeout(total=5)
            async with aiohttp.ClientSession() as session:
                async with session.head(url, timeout=timeout, allow_redirects=True) as response:
                    content_type = response.headers.get('Content-Type', '').lower()

            for marker, ext in self._CONTENT_TYPE_EXTENSIONS.items():
                if marker in content_type:
                    return ext

            return None
        except Exception:
            # Guessing is best effort. Only Exception is caught so that task
            # cancellation and interpreter shutdown still propagate.
            return None

    async def clear_downloads(self, max_age_days: int = 3) -> None:
        """
        Delete downloaded files older than the given number of days.

        The directory scan and deletions run in a worker thread so the event
        loop is not blocked by filesystem calls. Errors are logged, not raised.

        Args:
            max_age_days: Maximum age, in days, of files to keep. Defaults to 3.
        """
        try:
            max_age_seconds = max_age_days * 24 * 60 * 60
            cleared_count = await asyncio.to_thread(
                self._remove_expired_files, max_age_seconds
            )

            if cleared_count > 0:
                self.LOG.info(f"清理完成，共删除 {cleared_count} 个过期文件")

        except Exception as e:
            self.LOG.error(f"清理下载文件时出错: {str(e)}")

    def _remove_expired_files(self, max_age_seconds: float) -> int:
        """Remove regular files older than ``max_age_seconds``; return the count removed."""
        current_time = time.time()
        cleared_count = 0

        for filename in os.listdir(self.download_dir):
            file_path = os.path.join(self.download_dir, filename)

            try:
                # Sub-directories and other non-file entries are left alone.
                if not os.path.isfile(file_path):
                    continue
                file_age = current_time - os.path.getmtime(file_path)
            except OSError:
                # The entry disappeared or is unreadable; keep sweeping the
                # remaining files instead of aborting the whole cleanup.
                continue

            if file_age <= max_age_seconds:
                continue

            try:
                os.remove(file_path)
                cleared_count += 1
                self.LOG.debug(f"已删除过期文件: {file_path}")
            except OSError as e:
                self.LOG.error(f"删除文件失败 {file_path}: {str(e)}")

        return cleared_count