Skip to content

Qwen3-Omni-Captioner 音频理解 API 文档

Qwen3-Omni-Captioner是以通义千问3-Omni为基座的开源模型,无需任何提示,自动为复杂语音、环境声、音乐、影视声效等生成精准、全面的描述,能识别说话人的情绪、音乐元素(如风格、乐器)、敏感信息等,适用于音频内容分析、安全审核、意图识别、音频剪辑等多个领域。

📍 请求地址

https://www.dmxapi.cn/v1/chat/completions

🎯 模型名称

Qwen3-Omni-Captioner(请求体中的 model 参数填写:qwen3-omni-30b-a3b-captioner)

💻 音频理解URL 调用示例

python
# ============================================================================
#                    Qwen3-Omni 音频理解示例 (URL方式)
# ============================================================================
# 功能说明:通过URL方式上传音频文件,使用Qwen3-Omni模型进行音频内容理解
# 模型名称:qwen3-omni-30b-a3b-captioner
# ============================================================================

import requests

# ----------------------------------------------------------------------------
#                              API configuration
# ----------------------------------------------------------------------------
# Endpoint: DMXAPI Chat Completions
url = "https://www.dmxapi.cn/v1/chat/completions"

# Request headers
headers = {
    "Authorization": "sk-*****************************************",  # API key
    "Content-Type": "application/json",  # the body is sent as JSON
}

# ----------------------------------------------------------------------------
#                              Request body
# ----------------------------------------------------------------------------
data = {
    # Model to use
    "model": "qwen3-omni-30b-a3b-captioner",

    # Message list: one user message that carries a single audio part
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    # Audio input part
                    "type": "input_audio",
                    "input_audio": {
                        # Publicly accessible audio URL. The non-ASCII file
                        # name is percent-encoded (UTF-8) so the URL is plain
                        # ASCII.
                        "data": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20240916/xvappi/%E8%A3%85%E4%BF%AE%E5%99%AA%E9%9F%B3.wav"
                    }
                }
            ]
        }
    ]
}

# ----------------------------------------------------------------------------
#                              Send the request and print the result
# ----------------------------------------------------------------------------
# Without a timeout requests waits forever on a stalled connection.
# (connect timeout, read timeout) in seconds; the read timeout is generous
# because the reply only arrives once the whole caption has been generated.
response = requests.post(url, headers=headers, json=data, timeout=(10, 300))

try:
    print(response.json())
except ValueError:
    # The body was not JSON (e.g. an HTML error page from a gateway); show the
    # status code and raw text instead of failing with a decode traceback.
    print(f"HTTP {response.status_code}: {response.text}")

🚗 音频理解URL(流式输出)调用示例

python
# ============================================================================
#              Qwen3-Omni 音频理解示例 (URL方式 - 流式输出)
# ============================================================================
# 功能说明:通过URL方式上传音频,使用流式输出实时获取模型响应
# 模型名称:qwen3-omni-30b-a3b-captioner
# ============================================================================

import requests
import json
import time

# ----------------------------------------------------------------------------
#                              API configuration
# ----------------------------------------------------------------------------
BASE_URL = "https://www.dmxapi.cn/v1/chat/completions"  # endpoint
API_KEY = "sk-******************************************"  # API key

# ----------------------------------------------------------------------------
#                              Request headers
# ----------------------------------------------------------------------------
headers = {
    "Authorization": API_KEY,            # credentials: the key is sent as-is
    "Content-Type": "application/json",  # the body is sent as JSON
}

# ----------------------------------------------------------------------------
#                              Request body
# ----------------------------------------------------------------------------
# The only content part of the user message: an audio clip referenced by URL.
_audio_part = {
    "type": "input_audio",
    "input_audio": {
        "data": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20240916/xvappi/%E8%A3%85%E4%BF%AE%E5%99%AA%E9%9F%B3.wav",
    },
}

_user_message = {
    "role": "user",
    "content": [_audio_part],
}

data = {
    # Model to use
    "model": "qwen3-omni-30b-a3b-captioner",
    # Message list
    "messages": [_user_message],
    # Streaming: ask for incremental output, and for token usage statistics
    # to be included in the streamed response.
    "stream": True,
    "stream_options": {"include_usage": True},
}


# ============================================================================
#                              SSE helper
# ============================================================================
def _sse_payload(raw_line):
    """Return the payload of a server-sent-events ``data`` line.

    Args:
        raw_line: One line as yielded by ``Response.iter_lines()`` (bytes).

    Returns:
        The text after the ``data:`` field name with surrounding whitespace
        removed, or None when the line is empty or is not a ``data`` line.
    """
    if not raw_line:
        return None
    text = raw_line.decode("utf-8")
    # The space after "data:" is optional in the SSE format, so match on the
    # field name only and strip whatever whitespace follows.
    if not text.startswith("data:"):
        return None
    return text[len("data:"):].strip()


# ============================================================================
#                              Main function
# ============================================================================
def main():
    """Send the streaming request, echo the reply and print statistics.

    Content fragments are printed as they arrive. Afterwards the total
    duration, the time to first token and, when the server reported it, the
    token usage are printed. On a request error a message is printed and the
    function returns without printing statistics.
    """
    print("开始请求...")

    # ------------------------------------------------------------------------
    #                           Timing and state
    # ------------------------------------------------------------------------
    # perf_counter is monotonic, so the measured durations are not affected
    # by system clock adjustments.
    start_time = time.perf_counter()
    first_token_time = None       # set when the first content fragment arrives
    usage_info = None             # token usage reported by the server, if any

    # ------------------------------------------------------------------------
    #                           Streaming request
    # ------------------------------------------------------------------------
    try:
        # The context manager closes the response even though the loop stops
        # reading at "[DONE]"; otherwise the connection is never released.
        # timeout is (connect, read) in seconds; the read timeout applies to
        # the gap between received chunks, not to the whole stream.
        with requests.post(
            BASE_URL,
            headers=headers,
            json=data,
            stream=True,
            timeout=(10, 300),
        ) as response:
            response.raise_for_status()   # turn HTTP error statuses into exceptions

            for raw_line in response.iter_lines():
                payload = _sse_payload(raw_line)
                if payload is None:
                    continue

                # End-of-stream marker
                if payload == "[DONE]":
                    break

                try:
                    chunk = json.loads(payload)
                except json.JSONDecodeError:
                    # Best effort: skip a malformed chunk, keep streaming.
                    continue
                if not isinstance(chunk, dict):
                    continue

                # "choices" and "delta" may be missing, null or empty.
                choices = chunk.get("choices") or []
                if choices:
                    delta = choices[0].get("delta") or {}
                    content = delta.get("content") or ""
                    if content:
                        if first_token_time is None:
                            first_token_time = time.perf_counter()
                        # Echo immediately, without a trailing newline.
                        print(content, end="", flush=True)

                # Only keep real usage data, so a later chunk carrying
                # "usage": null cannot overwrite it.
                if chunk.get("usage"):
                    usage_info = chunk["usage"]

        print("\n")

    # ------------------------------------------------------------------------
    #                           Error handling
    # ------------------------------------------------------------------------
    except requests.exceptions.RequestException as e:
        print(f"\n请求错误: {e}")
        return

    # ========================================================================
    #                           Statistics
    # ========================================================================
    total_time = time.perf_counter() - start_time

    print("=" * 50)
    print("统计信息:")
    print(f"  总耗时: {total_time:.2f}秒")

    # Time to first token (TTFT)
    if first_token_time is not None:
        ttft = first_token_time - start_time
        print(f"  首Token延迟: {ttft:.2f}秒")

    # Token usage details
    if usage_info:
        prompt_tokens = usage_info.get("prompt_tokens") or 0
        completion_tokens = usage_info.get("completion_tokens") or 0
        total_tokens = usage_info.get("total_tokens") or 0

        print(f"  输入Tokens: {prompt_tokens}")
        print(f"  输出Tokens: {completion_tokens}")
        print(f"  总Tokens: {total_tokens}")

        # Output speed, averaged over the whole request (this includes the
        # time spent waiting for the first token).
        if total_time > 0 and completion_tokens > 0:
            tokens_per_sec = completion_tokens / total_time
            print(f"  输出速度: {tokens_per_sec:.2f} tokens/秒")


# ============================================================================
#                              Entry point
# ============================================================================
if __name__ == "__main__":
    main()

🚀 音频理解本地文件(Base64)调用示例

python
# ============================================================================
#              Qwen3-Omni 音频理解示例 (本地文件 - Base64方式)
# ============================================================================
# 功能说明:读取本地音频文件,转换为Base64编码后发送给模型进行理解
# 模型名称:qwen3-omni-30b-a3b-captioner
# 适用场景:本地音频文件处理、无公网URL的音频分析
# ============================================================================

import base64
import requests

# ----------------------------------------------------------------------------
#                              API configuration
# ----------------------------------------------------------------------------
url = "https://www.dmxapi.cn/v1/chat/completions"  # endpoint
api_key = "sk-************************************************"  # API key

# ----------------------------------------------------------------------------
#                              Audio file
# ----------------------------------------------------------------------------
# Path of the local audio file to send (wav, mp3 and similar formats).
# Set it to an empty string to use the default sample data instead.
file_path = "qwen/装修噪音.wav"  # e.g. "C:/audio/test.mp3"


# ============================================================================
#                              工具函数
# ============================================================================
def file_to_base64(path):
    """Return the contents of a local file as a Base64 string.

    Args:
        path: Location of the file to read; it is opened in binary mode.

    Returns:
        The Base64 encoding of the file's bytes, as text.
    """
    with open(path, "rb") as source:
        raw_bytes = source.read()
    encoded = base64.b64encode(raw_bytes)
    # b64encode returns bytes; convert to str so it can go into a JSON body.
    return str(encoded, "utf-8")


# ----------------------------------------------------------------------------
#                           Audio data
# ----------------------------------------------------------------------------
# With a file path set, the local file is encoded; otherwise the sample data
# is used. Either way the value has the form "data:;base64,<payload>".
if file_path:
    audio_data = f"data:;base64,{file_to_base64(file_path)}"
else:
    # Default sample data (Base64-encoded audio fragment).
    # NOTE: the literal below is abbreviated -- it ends in "....", which is
    # not Base64 -- so replace it with a complete string before relying on
    # this branch.
    audio_data = "data:;base64,SUQzBAAAAAAAI1RTU0UAAAAPAAADTGF2ZjU4LjI5...."

# ----------------------------------------------------------------------------
#                              Request headers
# ----------------------------------------------------------------------------
headers = {
    "Authorization": f"{api_key}",       # credentials: the key is sent as-is
    "Content-Type": "application/json",  # the body is sent as JSON
}

# ----------------------------------------------------------------------------
#                              Request body
# ----------------------------------------------------------------------------
data = {
    # Model to use
    "model": "qwen3-omni-30b-a3b-captioner",

    # Message list: one user message that carries a single audio part
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "input_audio",      # audio input part
                    "input_audio": {
                        "data": audio_data      # Base64-encoded audio data
                    }
                }
            ]
        }
    ]
}

# ----------------------------------------------------------------------------
#                           Send the request and print the result
# ----------------------------------------------------------------------------
# Without a timeout requests waits forever on a stalled connection.
# (connect timeout, read timeout) in seconds; the read timeout is generous
# because the reply only arrives once the whole caption has been generated.
response = requests.post(url, headers=headers, json=data, timeout=(10, 300))

try:
    print(response.json())
except ValueError:
    # The body was not JSON (e.g. an HTML error page from a gateway); show the
    # status code and raw text instead of failing with a decode traceback.
    print(f"HTTP {response.status_code}: {response.text}")

📚 阿里官方网站

https://help.aliyun.com/zh/model-studio/qwen3-omni-captioner

© 2025 DMXAPI Qwen3-Omni-Captioner音频理解

一个 Key 用全球大模型