# rtc-voice-chat/backend/main.py

"""
Copyright 2025 Beijing Volcano Engine Technology Co., Ltd. All Rights Reserved.
SPDX-license-identifier: BSD-3-Clause

FastAPI backend — migrated from Server/app.js (Node.js + Koa)
"""

import json
import os
import time
import uuid
from pathlib import Path

import httpx
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse

from signer import Signer
from rtc_token import AccessToken, privileges

# ASGI application object; the route decorators below register handlers on it.
app = FastAPI()

# CORS is left fully open: every origin, HTTP method and request header is
# accepted.
# NOTE(review): presumably meant for local development with a separately
# served frontend — confirm before exposing this service publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Directory, next to this file, that is scanned for per-scene JSON files.
SCENES_DIR = Path(__file__).parent / "scenes"


def load_scenes(scenes_dir: "Path | str | None" = None) -> dict:
    """Load every ``*.json`` scene definition found directly in a directory.

    Args:
        scenes_dir: Directory to scan. When omitted, the module-level
            ``SCENES_DIR`` is used, so existing zero-argument callers keep
            working unchanged.

    Returns:
        A dict mapping each file's stem (its name without ``.json``) to the
        parsed JSON content, inserted in sorted file-name order.

    Raises:
        json.JSONDecodeError: If a scene file does not contain valid JSON.
        OSError: If a scene file cannot be opened or read.
    """
    root = SCENES_DIR if scenes_dir is None else Path(scenes_dir)
    scenes = {}
    # glob() yields entries in arbitrary, filesystem-dependent order. Sorting
    # keeps the resulting dict order (which /getScenes iterates) stable
    # across platforms and restarts.
    for scene_path in sorted(root.glob("*.json")):
        with open(scene_path, encoding="utf-8") as f:
            scenes[scene_path.stem] = json.load(f)
    return scenes


# Scene definitions are read once, at import time, and kept in memory keyed by
# scene ID (the JSON file's stem). The request handlers below look scenes up
# here, and /getScenes also writes generated values back into these dicts.
Scenes = load_scenes()


def assert_value(value, msg: str):
    """Validate that a configuration or request value is usable.

    A value is accepted when it is truthy and, if it is a string, contains no
    space character. Anything else is rejected.

    Args:
        value: The value to check.
        msg: Message carried by the raised error.

    Raises:
        ValueError: If ``value`` is falsy, or is a string containing a space.
    """
    has_space = isinstance(value, str) and " " in value
    if value and not has_space:
        return
    raise ValueError(msg)


def error_response(action: str, message: str):
    """Build the JSON error envelope returned to the client.

    Args:
        action: Name of the action that failed; echoed back under
            ``ResponseMetadata.Action``.
        message: Human-readable description of the failure.

    Returns:
        A ``JSONResponse`` whose body is
        ``{"ResponseMetadata": {"Action": ..., "Error": {"Code": -1, "Message": ...}}}``.
        No status code is passed, so the response keeps ``JSONResponse``'s
        default status.
    """
    error = {"Code": -1, "Message": message}
    metadata = {"Action": action, "Error": error}
    return JSONResponse({"ResponseMetadata": metadata})


@app.post("/proxy")
async def proxy(request: Request):
    """Forward an RTC OpenAPI call to ``rtc.volcengineapi.com`` for the client.

    The client supplies ``Action`` (and optionally ``Version``, defaulting to
    ``2024-12-01``) as query parameters, plus a JSON body containing
    ``SceneID``. The scene stored in ``Scenes`` under that ID provides both
    the account credentials handed to the signer and the body sent upstream:

    * ``StartVoiceChat`` sends the scene's whole ``VoiceChat`` section.
    * ``StopVoiceChat`` sends only ``AppId``, ``RoomId`` and ``TaskId`` taken
      from that section.
    * Any other action is forwarded with an empty JSON object.

    Args:
        request: The incoming HTTP request.

    Returns:
        A ``JSONResponse`` carrying the upstream JSON body as-is, or the
        ``error_response`` envelope if validation, signing, the upstream call
        or decoding of its reply raises.
    """
    action = request.query_params.get("Action", "")
    version = request.query_params.get("Version", "2024-12-01")

    try:
        assert_value(action, "Action 不能为空")
        assert_value(version, "Version 不能为空")

        body = await request.json()
        # A JSON array/string/number has no SceneID; treat it like a missing
        # one so the caller gets the regular validation message instead of an
        # internal AttributeError text.
        scene_id = body.get("SceneID", "") if isinstance(body, dict) else ""
        assert_value(scene_id, "SceneID 不能为空，SceneID 用于指定场景的 JSON")

        json_data = Scenes.get(scene_id)
        if not json_data:
            raise ValueError(
                f"{scene_id} 不存在，请先在 backend/scenes 下定义该场景的 JSON."
            )

        voice_chat = json_data.get("VoiceChat", {})
        account_config = json_data.get("AccountConfig", {})
        assert_value(
            account_config.get("accessKeyId"), "AccountConfig.accessKeyId 不能为空"
        )
        assert_value(
            account_config.get("secretKey"), "AccountConfig.secretKey 不能为空"
        )

        if action == "StartVoiceChat":
            req_body = voice_chat
        elif action == "StopVoiceChat":
            app_id = voice_chat.get("AppId", "")
            room_id = voice_chat.get("RoomId", "")
            task_id = voice_chat.get("TaskId", "")
            assert_value(app_id, "VoiceChat.AppId 不能为空")
            assert_value(room_id, "VoiceChat.RoomId 不能为空")
            assert_value(task_id, "VoiceChat.TaskId 不能为空")
            req_body = {"AppId": app_id, "RoomId": room_id, "TaskId": task_id}
        else:
            req_body = {}

        # Kept separate from the copy handed to the signer so the query sent
        # upstream is exactly Action + Version, whatever the signer does with
        # its own input.
        query = {"Action": action, "Version": version}
        request_data = {
            "region": "cn-north-1",
            "method": "POST",
            "params": dict(query),
            "headers": {
                "Host": "rtc.volcengineapi.com",
                "Content-type": "application/json",
            },
            "body": req_body,
        }
        # NOTE(review): Signer is defined outside this file; it presumably
        # adds the signature headers to request_data["headers"] in place,
        # which is why that dict is forwarded below — confirm in signer.py.
        signer = Signer(request_data, "rtc")
        signer.add_authorization(account_config)

        async with httpx.AsyncClient() as client:
            # Let httpx build the query string: Action/Version come straight
            # from the caller, and interpolating them into the URL unescaped
            # would let characters such as "&" or "#" alter the request.
            resp = await client.post(
                "https://rtc.volcengineapi.com",
                params=query,
                headers=request_data["headers"],
                json=req_body,
            )
        return JSONResponse(resp.json())

    except Exception as e:
        # Boundary handler: validation errors (ValueError) and unexpected
        # failures alike are reported to the client in the same envelope.
        return error_response(action, str(e))


def _ensure_rtc_credentials(
    scene_name: str, rtc_config: dict, voice_chat: dict
) -> None:
    """Fill in a scene's missing RoomId / UserId / Token, in place.

    Nothing is changed when ``rtc_config`` already has a Token, UserId and
    RoomId. Otherwise missing RoomId/UserId values are replaced by fresh
    UUIDs, the RoomId is copied into ``voice_chat``, the first entry of
    ``voice_chat["AgentConfig"]["TargetUserId"]`` (when present) is set to the
    user ID, and a new token expiring 24 hours from now is generated.

    NOTE(review): both dicts are mutated. When they are the dicts held in
    ``Scenes`` (as ``get_scenes`` passes them), the generated values persist
    for the life of the process, so later calls return early and keep serving
    the same token even after its 24-hour expiry has passed.

    Args:
        scene_name: Scene ID, used only in error messages.
        rtc_config: The scene's ``RTCConfig`` section.
        voice_chat: The scene's ``VoiceChat`` section.

    Raises:
        ValueError: If ``AppId`` is invalid, or a token must be generated and
            ``AppKey`` is invalid.
    """
    app_id = rtc_config.get("AppId", "")
    assert_value(app_id, f"{scene_name} 场景的 RTCConfig.AppId 不能为空")

    token = rtc_config.get("Token", "")
    user_id = rtc_config.get("UserId", "")
    room_id = rtc_config.get("RoomId", "")
    if token and user_id and room_id:
        return

    rtc_config["RoomId"] = voice_chat["RoomId"] = room_id or str(uuid.uuid4())
    rtc_config["UserId"] = user_id or str(uuid.uuid4())
    target_user_ids = (voice_chat.get("AgentConfig") or {}).get("TargetUserId")
    if target_user_ids:
        target_user_ids[0] = rtc_config["UserId"]

    app_key = rtc_config.get("AppKey", "")
    assert_value(app_key, f"自动生成 Token 时，{scene_name} 场景的 AppKey 不可为空")
    key = AccessToken(app_id, app_key, rtc_config["RoomId"], rtc_config["UserId"])
    # NOTE(review): the second argument is presumably a per-privilege expiry
    # where 0 means "no separate expiry" — confirm in rtc_token.
    key.add_privilege(privileges["PrivSubscribeStream"], 0)
    key.add_privilege(privileges["PrivPublishStream"], 0)
    key.expire_time(int(time.time()) + 24 * 3600)
    rtc_config["Token"] = key.serialize()


def _describe_scene(scene_name: str, scene_config: dict, voice_chat: dict) -> dict:
    """Add the client-facing fields derived from ``voice_chat`` to a scene.

    ``scene_config`` is updated in place and also returned. ``isVision``,
    ``isAvatarScene``, ``botName`` and ``avatarBgUrl`` are passed through as
    stored (``None`` when absent); ``isInterruptMode`` and ``isScreenMode``
    are always booleans.
    """
    config = voice_chat.get("Config", {})
    vision = config.get("LLMConfig", {}).get("VisionConfig", {})
    avatar = config.get("AvatarConfig", {})

    scene_config["id"] = scene_name
    scene_config["botName"] = voice_chat.get("AgentConfig", {}).get("UserId")
    scene_config["isInterruptMode"] = config.get("InterruptMode") == 0
    scene_config["isVision"] = vision.get("Enable")
    scene_config["isScreenMode"] = (
        vision.get("SnapshotConfig", {}).get("StreamType") == 1
    )
    scene_config["isAvatarScene"] = avatar.get("Enabled")
    scene_config["avatarBgUrl"] = avatar.get("BackgroundUrl")
    return scene_config


@app.post("/getScenes")
async def get_scenes():
    """List every loaded scene with its display settings and RTC join data.

    For each entry in ``Scenes`` the RTC credentials are completed if needed
    and the scene's display flags are derived from its ``VoiceChat`` section.

    Returns:
        A ``JSONResponse`` with ``Result.scenes`` holding one
        ``{"scene": ..., "rtc": ...}`` object per scene, where ``rtc`` is the
        scene's ``RTCConfig`` without ``AppKey``. If any scene fails
        validation, the ``error_response`` envelope for action ``getScenes``
        is returned instead.
    """
    try:
        scenes_list = []
        for scene_name, data in Scenes.items():
            scene_config = data.get("SceneConfig", {})
            rtc_config = data.get("RTCConfig", {})
            voice_chat = data.get("VoiceChat", {})

            _ensure_rtc_credentials(scene_name, rtc_config, voice_chat)
            _describe_scene(scene_name, scene_config, voice_chat)

            # AppKey is the token-signing secret; never send it to the client.
            rtc_out = {k: v for k, v in rtc_config.items() if k != "AppKey"}
            scenes_list.append({"scene": scene_config, "rtc": rtc_out})

        return JSONResponse(
            {
                "ResponseMetadata": {"Action": "getScenes"},
                "Result": {"scenes": scenes_list},
            }
        )

    except ValueError as e:
        return error_response("getScenes", str(e))


# Script entry point: start the app under uvicorn when run directly.
if __name__ == "__main__":
    # Imported here so uvicorn is only required when running this file as a
    # script.
    import uvicorn

    # Listen on all interfaces, port 3001, with auto-reload switched on.
    # NOTE(review): reload=True is presumably a development convenience —
    # confirm it is not used for production deployments.
    uvicorn.run("main:app", host="0.0.0.0", port=3001, reload=True)