# happy-life-star/backend-single/asr-service/app.py

import os
import sys
import tempfile
import time
import types
import importlib.machinery
from pathlib import Path
from threading import Lock

from fastapi import FastAPI, File, UploadFile

# ASGI application object; the route decorators below register on it.
app = FastAPI(title="Emotion Museum ASR")

# Model identifier/path handed to the recogniser; override with ASR_MODEL.
MODEL_NAME = os.getenv("ASR_MODEL", "/data/programs/emotion-museum/asr-service/models/paraformer-zh-onnx")
# Device label taken from ASR_DEVICE. In this file it is only echoed back by
# /health; get_model() hard-codes device_id=-1 and does not read this value.
DEVICE = os.getenv("ASR_DEVICE", "cpu")
# Directory that receives the temporary copies of uploaded audio; override
# with ASR_WORK_DIR. It is created at import time if it does not exist.
WORK_DIR = Path(os.getenv("ASR_WORK_DIR", "/tmp/emotion-museum-asr"))
WORK_DIR.mkdir(parents=True, exist_ok=True)

# Lazily created recogniser instance (None until get_model() first runs) and
# the lock get_model() holds while checking/creating it.
_model = None
_model_lock = Lock()


def _install_torch_placeholder():
    """Register a minimal stand-in ``torch`` module unless one is already imported.

    funasr-onnx imports the optional SenseVoice module from its package
    ``__init__``, and that module imports torch even when only Paraformer is
    used. This service intentionally runs the ONNX path without PyTorch, so a
    placeholder exposing just ``Tensor`` is put in ``sys.modules`` instead.
    """
    if "torch" in sys.modules:
        return
    placeholder = types.ModuleType("torch")
    placeholder.__spec__ = importlib.machinery.ModuleSpec("torch", loader=None)
    placeholder.Tensor = type("Tensor", (), {})
    sys.modules["torch"] = placeholder


def get_model():
    """Return the shared Paraformer instance, creating it on the first call.

    The whole check-and-create sequence runs under ``_model_lock``, so the
    model is constructed at most once; later calls return the cached object.
    """
    global _model
    with _model_lock:
        if _model is not None:
            return _model

        _install_torch_placeholder()
        # Imported lazily so the placeholder above is in place beforehand.
        from funasr_onnx import Paraformer

        recognizer_options = dict(
            batch_size=1,
            device_id=-1,
            quantize=True,
            intra_op_num_threads=2,
        )
        _model = Paraformer(MODEL_NAME, **recognizer_options)
        return _model


# Control tags stripped from recognised text, removed one after another in
# exactly this order.
_ASR_TAGS = (
    "<|zh|>",
    "<|en|>",
    "<|yue|>",
    "<|ja|>",
    "<|ko|>",
    "<|nospeech|>",
    "<|withitn|>",
    "<|woitn|>",
)


def clean_text(text):
    """Normalise a raw recognition value into a plain, trimmed string.

    A list or tuple is reduced to its first element (an empty one counts as
    no text). Any falsy value yields ``""``. Otherwise every known control
    tag is removed and surrounding whitespace is stripped.
    """
    if isinstance(text, (list, tuple)):
        # Only the leading entry carries the transcript.
        text = next(iter(text), "")

    if not text:
        return ""

    cleaned = text
    for tag in _ASR_TAGS:
        cleaned = cleaned.replace(tag, "")
    return cleaned.strip()


@app.get("/health")
def health():
    """Report a static OK status plus the configured engine, model and device.

    ``loaded`` tells whether the recogniser has already been created; this
    endpoint never triggers model loading itself.
    """
    model_ready = _model is not None
    report = dict(
        status="ok",
        engine="funasr-onnx",
        model=MODEL_NAME,
        device=DEVICE,
        loaded=model_ready,
    )
    return report


# Keeps recognitions one-at-a-time, as they effectively were while inference
# ran directly on the event loop; only the event loop itself is freed up.
_inference_lock = Lock()


def _recognize(audio_path):
    """Run the recogniser on one audio file path and return its raw result.

    Blocking: it may load the model on first use and then performs the
    inference, so it is meant to be called from a worker thread.
    """
    model = get_model()
    with _inference_lock:
        return model([audio_path])


def _transcribe_response(started, text, language, error_message):
    """Build the JSON payload shared by the success and failure paths.

    ``started`` is a ``time.perf_counter()`` reading taken when the request
    began; ``durationMs`` is the time elapsed since then.
    """
    return {
        "success": bool(text),
        "text": text,
        "language": language,
        "durationMs": int((time.perf_counter() - started) * 1000),
        "engine": "funasr-onnx",
        "model": MODEL_NAME,
        "errorMessage": error_message,
    }


@app.post("/transcribe")
async def transcribe(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file.

    The upload is streamed into a temporary file under ``WORK_DIR`` (keeping
    the original file extension, ``.wav`` if there is none), recognised, and
    the temporary file is removed again whatever the outcome.

    Returns a dict with ``success``, ``text``, ``language``, ``durationMs``,
    ``engine``, ``model`` and ``errorMessage``. Failures are not raised: any
    exception is reported as ``success=False`` with the error text in
    ``errorMessage``; an empty transcript is reported the same way with
    ``"empty recognition result"``.
    """
    # Local import keeps the file's top-level import block untouched.
    from fastapi.concurrency import run_in_threadpool

    # perf_counter is monotonic, so the duration cannot be skewed by
    # wall-clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    suffix = Path(file.filename or "audio.wav").suffix or ".wav"
    tmp_path = None

    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix, dir=WORK_DIR) as tmp:
            tmp_path = Path(tmp.name)
            # Copy in 1 MiB chunks so large uploads are never fully in memory.
            while chunk := await file.read(1024 * 1024):
                tmp.write(chunk)

        # Model loading and inference are blocking; running them in the
        # thread pool keeps the event loop (and /health) responsive.
        result = await run_in_threadpool(_recognize, str(tmp_path))

        first = result[0] if isinstance(result, list) and result else result
        if isinstance(first, dict):
            raw_text = first.get("preds", first.get("text", ""))
            language = first.get("language")
        else:
            raw_text = str(first or "")
            language = None
        text = clean_text(raw_text)

        return _transcribe_response(
            started,
            text,
            language,
            None if text else "empty recognition result",
        )
    except Exception as exc:
        # Some exceptions stringify to "", which would leave the caller with
        # no hint at all; fall back to the exception class name.
        return _transcribe_response(started, "", None, str(exc) or type(exc).__name__)
    finally:
        if tmp_path is not None:
            try:
                tmp_path.unlink(missing_ok=True)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not turn a
                # finished request into an error.
                pass