feat: TTS 服务功能完善(任务管理、配置优化、客户端实现)

This commit is contained in:
2026-05-26 20:49:58 +08:00
parent 2d7776dd4d
commit c289097ca0
11 changed files with 307 additions and 39 deletions
@@ -61,8 +61,11 @@ public class TtsController {
@GetMapping("/tasks/by-source")
public Result<TtsTaskResponse> bySource(@Parameter(description = "来源类型") @RequestParam String sourceType,
@Parameter(description = "来源 ID") @RequestParam String sourceId,
@Parameter(description = "音色") @RequestParam(required = false) String voice) {
return Result.success(ttsTaskService.getBySource(sourceType, sourceId, voice));
@Parameter(description = "音色") @RequestParam(required = false) String voice,
@Parameter(description = "语速") @RequestParam(required = false) Double speechRate,
@Parameter(description = "音调") @RequestParam(required = false) Double pitch,
@Parameter(description = "情绪") @RequestParam(required = false) String emotion) {
return Result.success(ttsTaskService.getBySource(sourceType, sourceId, voice, speechRate, pitch, emotion));
}
@Operation(summary = "获取音频文件", description = "返回已合成的音频音频文件(MP3 或 WAV 格式)。")
@@ -3,6 +3,8 @@ package com.emotion.dto.request.tts;
import lombok.Data;
import javax.validation.constraints.NotBlank;
import javax.validation.constraints.DecimalMax;
import javax.validation.constraints.DecimalMin;
import javax.validation.constraints.Size;
@Data
@@ -18,4 +20,15 @@ public class TtsTaskCreateRequest {
@Size(max = 64)
private String voice;
@DecimalMin("0.60")
@DecimalMax("1.40")
private Double speechRate;
@DecimalMin("-20.00")
@DecimalMax("20.00")
private Double pitch;
@Size(max = 32)
private String emotion;
}
@@ -12,6 +12,9 @@ public class TtsTaskResponse {
private String sourceId;
private String status;
private String voice;
private Double speechRate;
private Double pitch;
private String emotion;
private String audioUrl;
private Long durationMs;
private String errorMessage;
@@ -2,7 +2,37 @@ package com.emotion.service;
public interface TtsEngineClient {
TtsEngineResult synthesize(String text, String voice, String outputPath);
TtsEngineResult synthesize(String text, String voice, String outputPath, SynthesisOptions options);
/**
 * Immutable holder for the optional prosody settings handed to
 * {@code synthesize}: speech rate, pitch and emotion.
 *
 * <p>The constructor stores the values exactly as given (no validation or
 * defaulting), so any of them may be {@code null}.
 */
class SynthesisOptions {
    private final Double speechRate;
    private final Double pitch;
    private final String emotion;

    /**
     * @param speechRate requested speech rate, or {@code null}
     * @param pitch      requested pitch, or {@code null}
     * @param emotion    requested emotion label, or {@code null}
     */
    public SynthesisOptions(Double speechRate, Double pitch, String emotion) {
        this.speechRate = speechRate;
        this.pitch = pitch;
        this.emotion = emotion;
    }

    /** @return the stored speech rate, possibly {@code null} */
    public Double getSpeechRate() {
        return speechRate;
    }

    /** @return the stored pitch, possibly {@code null} */
    public Double getPitch() {
        return pitch;
    }

    /** @return the stored emotion label, possibly {@code null} */
    public String getEmotion() {
        return emotion;
    }

    /**
     * Renders the three settings as {@code rate=<r>;pitch=<p>;emotion=<e>}.
     * A {@code null} setting is rendered as an empty value.
     *
     * @return the textual key describing this option set
     */
    public String cacheKey() {
        StringBuilder key = new StringBuilder("rate=");
        if (speechRate != null) {
            key.append(speechRate);
        }
        key.append(";pitch=");
        if (pitch != null) {
            key.append(pitch);
        }
        key.append(";emotion=");
        if (emotion != null) {
            key.append(emotion);
        }
        return key.toString();
    }
}
class TtsEngineResult {
private final boolean success;
@@ -11,5 +11,6 @@ public interface TtsTaskService extends IService<TtsTask> {
TtsTaskResponse getTask(String id);
TtsTaskResponse getBySource(String sourceType, String sourceId, String voice);
TtsTaskResponse getBySource(String sourceType, String sourceId, String voice,
Double speechRate, Double pitch, String emotion);
}
@@ -5,7 +5,9 @@ import org.springframework.beans.factory.annotation.Value;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Service;
import org.springframework.web.client.RestTemplate;
import org.springframework.util.StringUtils;
import java.util.HashMap;
import java.util.Map;
@Service
@@ -21,13 +23,23 @@ public class HttpTtsEngineClient implements TtsEngineClient {
}
@Override
public TtsEngineResult synthesize(String text, String voice, String outputPath) {
public TtsEngineResult synthesize(String text, String voice, String outputPath, SynthesisOptions options) {
try {
Map<String, Object> body = Map.of(
"text", text,
"voice", voice,
"outputPath", outputPath
);
Map<String, Object> body = new HashMap<>();
body.put("text", text);
body.put("voice", voice);
body.put("outputPath", outputPath);
if (options != null) {
if (options.getSpeechRate() != null) {
body.put("speechRate", options.getSpeechRate());
}
if (options.getPitch() != null) {
body.put("pitch", options.getPitch());
}
if (StringUtils.hasText(options.getEmotion())) {
body.put("emotion", options.getEmotion());
}
}
ResponseEntity<Map> response = restTemplate.postForEntity(engineUrl + "/synthesize", body, Map.class);
Map<?, ?> data = response.getBody();
boolean success = data != null && Boolean.TRUE.equals(data.get("success"));
@@ -19,6 +19,9 @@ import org.springframework.util.DigestUtils;
import org.springframework.util.StringUtils;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
@@ -32,6 +35,10 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
private static final String STATUS_PROCESSING = "processing";
private static final String STATUS_SUCCESS = "success";
private static final String STATUS_FAILED = "failed";
private static final double FALLBACK_SPEECH_RATE = 0.92D;
private static final double FALLBACK_PITCH = 0D;
private static final String FALLBACK_EMOTION = "story";
private static final int NATURAL_PARAGRAPH_LIMIT = 140;
private final EpicScriptMapper epicScriptMapper;
private final TtsEngineClient ttsEngineClient;
@@ -52,6 +59,15 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
@Value("${emotion.tts.default-voice:default_zh_female}")
private String defaultVoice;
@Value("${emotion.tts.default-speech-rate:0.92}")
private double defaultSpeechRate;
@Value("${emotion.tts.default-pitch:0}")
private double defaultPitch;
@Value("${emotion.tts.default-emotion:story}")
private String defaultEmotion;
public TtsTaskServiceImpl(EpicScriptMapper epicScriptMapper,
TtsEngineClient ttsEngineClient,
@Qualifier("taskExecutor") Executor taskExecutor) {
@@ -70,15 +86,16 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
String sourceType = normalizeSourceType(request.getSourceType());
String sourceId = request.getSourceId().trim();
String voice = resolveVoice(request.getVoice());
TtsEngineClient.SynthesisOptions options = resolveOptions(request);
String cleaned = cleanText(loadSourceText(userId, sourceType, sourceId));
if (!StringUtils.hasText(cleaned)) {
throw new IllegalArgumentException("Source text is empty");
}
if (cleaned.length() > maxTextLength) {
cleaned = cleaned.substring(0, maxTextLength);
cleaned = limitReadableText(cleaned, maxTextLength);
}
String hash = DigestUtils.md5DigestAsHex((voice + "\n" + cleaned).getBytes(StandardCharsets.UTF_8));
String hash = DigestUtils.md5DigestAsHex((voice + "\n" + options.cacheKey() + "\n" + cleaned).getBytes(StandardCharsets.UTF_8));
TtsTask owned = findOwnedTask(userId, sourceType, sourceId, voice, hash);
if (owned != null) {
incrementRequestCount(owned);
@@ -100,7 +117,7 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
TtsTask task = buildTask(userId, sourceType, sourceId, voice, hash, cleaned.length());
save(task);
String synthesisText = cleaned;
CompletableFuture.runAsync(() -> process(task.getId(), synthesisText, voice, task.getAudioPath()), taskExecutor);
CompletableFuture.runAsync(() -> process(task.getId(), synthesisText, voice, task.getAudioPath(), options), taskExecutor);
return toResponse(task);
}
@@ -115,20 +132,22 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
}
@Override
public TtsTaskResponse getBySource(String sourceType, String sourceId, String voice) {
public TtsTaskResponse getBySource(String sourceType, String sourceId, String voice,
Double speechRate, Double pitch, String emotion) {
String userId = currentUserId();
TtsTask task = getOne(new LambdaQueryWrapper<TtsTask>()
.eq(TtsTask::getUserId, userId)
.eq(TtsTask::getSourceType, normalizeSourceType(sourceType))
.eq(TtsTask::getSourceId, sourceId)
.eq(TtsTask::getVoice, resolveVoice(voice))
.eq(TtsTask::getIsDeleted, 0)
.orderByDesc(TtsTask::getCreateTime)
.last("LIMIT 1"));
String normalizedSourceType = normalizeSourceType(sourceType);
String normalizedVoice = resolveVoice(voice);
TtsEngineClient.SynthesisOptions options = resolveOptions(speechRate, pitch, emotion);
String cleaned = cleanText(loadSourceText(userId, normalizedSourceType, sourceId));
if (cleaned.length() > maxTextLength) {
cleaned = limitReadableText(cleaned, maxTextLength);
}
String hash = DigestUtils.md5DigestAsHex((normalizedVoice + "\n" + options.cacheKey() + "\n" + cleaned).getBytes(StandardCharsets.UTF_8));
TtsTask task = findOwnedTask(userId, normalizedSourceType, sourceId, normalizedVoice, hash);
return task == null ? null : toResponse(task);
}
private void process(String taskId, String text, String voice, String outputPath) {
private void process(String taskId, String text, String voice, String outputPath, TtsEngineClient.SynthesisOptions options) {
try {
TtsTask task = getById(taskId);
if (task == null) {
@@ -138,7 +157,7 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
task.setErrorMessage(null);
updateById(task);
TtsEngineClient.TtsEngineResult result = ttsEngineClient.synthesize(text, voice, outputPath);
TtsEngineClient.TtsEngineResult result = ttsEngineClient.synthesize(text, voice, outputPath, options);
task = getById(taskId);
if (task == null) {
return;
@@ -220,13 +239,15 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
StringBuilder text = new StringBuilder();
append(text, script.getTitle());
Map<String, Object> plotJson = script.getPlotJson();
Object fullContent = plotJson == null ? null : plotJson.get("fullContent");
if (fullContent != null && StringUtils.hasText(String.valueOf(fullContent))) {
append(text, String.valueOf(fullContent));
} else {
append(text, script.getPlotIntro());
append(text, script.getPlotTurning());
append(text, script.getPlotClimax());
append(text, script.getPlotEnding());
Map<String, Object> plotJson = script.getPlotJson();
if (plotJson != null && plotJson.get("fullContent") != null) {
append(text, String.valueOf(plotJson.get("fullContent")));
}
return text.toString();
}
@@ -235,9 +256,39 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
if (text == null) {
return "";
}
return text.replaceAll("[#>*_`\\-]", "")
.replaceAll("\\s+", " ")
.trim();
String normalized = text.replace("\r\n", "\n")
.replace('\r', '\n')
.replaceAll("!\\[[^\\]]*]\\([^)]*\\)", "")
.replaceAll("\\[([^\\]]+)]\\([^)]*\\)", "$1")
.replaceAll("(?m)^\\s{0,3}#{1,6}\\s*", "")
.replaceAll("(?m)^\\s*>\\s?", "")
.replaceAll("(?m)^\\s*[-*+]\\s+", "")
.replaceAll("(?m)^\\s*\\d+[.)、]\\s+", "")
.replaceAll("<[^>]+>", "")
.replaceAll("[*_`~]", "")
.replaceAll("[“”]", "\"")
.replaceAll("[‘’]", "'")
.replaceAll("\\.{3,}", "……")
.replaceAll("-{2,}", "")
.replaceAll("[\\t\\u00A0]+", " ")
.replaceAll(" {2,}", " ")
.replaceAll("(?<=[\\p{IsHan}])[ \\t\\u00A0]+(?=[\\p{IsHan}])", "")
.replaceAll("[ \\t\\u00A0]*([,。!?;:、,.!?;:])[ \\t\\u00A0]*", "$1")
.replaceAll(",", "")
.replaceAll("!", "")
.replaceAll("\\?", "")
.replaceAll(";", "")
.replaceAll(":", "");
List<String> paragraphs = new ArrayList<>();
for (String paragraph : normalized.split("\\n+")) {
String trimmed = paragraph.trim();
if (!StringUtils.hasText(trimmed)) {
continue;
}
paragraphs.addAll(toReadableParagraphs(trimmed));
}
return String.join("\n\n", paragraphs).trim();
}
private TtsTaskResponse toResponse(TtsTask task) {
@@ -253,6 +304,23 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
.build();
}
/**
 * Builds the synthesis options for a create request by forwarding its
 * speech-rate, pitch and emotion fields to the three-argument overload.
 *
 * @param request the task creation request; its prosody fields may be {@code null}
 * @return the resolved options
 */
private TtsEngineClient.SynthesisOptions resolveOptions(TtsTaskCreateRequest request) {
    Double requestedRate = request.getSpeechRate();
    Double requestedPitch = request.getPitch();
    String requestedEmotion = request.getEmotion();
    return resolveOptions(requestedRate, requestedPitch, requestedEmotion);
}
/**
 * Resolves the effective synthesis options from optional request values.
 *
 * <p>A missing speech rate falls back to the configured default (or to
 * {@code FALLBACK_SPEECH_RATE} when that default is not positive); a missing
 * pitch falls back to the configured default pitch; a blank emotion falls back
 * to the configured default emotion, or to {@code FALLBACK_EMOTION} when that
 * is blank too. Rate is clamped to [0.60, 1.40] and pitch to [-20, 20], both
 * rounded to two decimals; the emotion is trimmed and lower-cased.
 *
 * @param requestSpeechRate requested speech rate, or {@code null}
 * @param requestPitch      requested pitch, or {@code null}
 * @param requestEmotion    requested emotion label, or {@code null}/blank
 * @return the normalized options
 */
private TtsEngineClient.SynthesisOptions resolveOptions(Double requestSpeechRate, Double requestPitch, String requestEmotion) {
    double rate;
    if (requestSpeechRate != null) {
        rate = requestSpeechRate;
    } else {
        rate = defaultOrFallback(defaultSpeechRate, FALLBACK_SPEECH_RATE);
    }

    double tone;
    if (requestPitch != null) {
        tone = requestPitch;
    } else {
        tone = defaultPitch;
    }

    String mood;
    if (StringUtils.hasText(requestEmotion)) {
        mood = requestEmotion.trim();
    } else if (StringUtils.hasText(defaultEmotion)) {
        mood = defaultEmotion.trim();
    } else {
        mood = FALLBACK_EMOTION;
    }

    double boundedRate = round(clamp(rate, 0.60D, 1.40D));
    double boundedPitch = round(clamp(tone, -20D, 20D));
    return new TtsEngineClient.SynthesisOptions(boundedRate, boundedPitch, mood.toLowerCase(Locale.ROOT));
}
private String currentUserId() {
String userId = UserContextHolder.getCurrentUserId();
if (!StringUtils.hasText(userId)) {
@@ -275,6 +343,91 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
}
}
/**
 * Splits one paragraph into shorter readable parts.
 *
 * <p>A part is closed after every hard sentence end, and also at a soft pause
 * once the part being built has reached {@code NATURAL_PARAGRAPH_LIMIT}
 * characters. Whatever is left after the last character is flushed as a
 * final part.
 *
 * @param paragraph a single paragraph of cleaned text
 * @return the parts in their original order
 */
private static List<String> toReadableParagraphs(String paragraph) {
    List<String> sentences = new ArrayList<>();
    StringBuilder pending = new StringBuilder();
    for (char symbol : paragraph.toCharArray()) {
        pending.append(symbol);
        boolean closeHere = isHardSentenceEnd(symbol);
        if (!closeHere && pending.length() >= NATURAL_PARAGRAPH_LIMIT) {
            // Long run without a sentence end: break at the next soft pause instead.
            closeHere = isSoftPause(symbol);
        }
        if (closeHere) {
            addReadablePart(sentences, pending);
        }
    }
    // Flush the tail that did not end on a boundary character.
    addReadablePart(sentences, pending);
    return sentences;
}
/**
 * Flushes the buffer into {@code parts} and empties it.
 *
 * <p>The buffered text is trimmed; blank text is dropped. Text longer than
 * {@code NATURAL_PARAGRAPH_LIMIT + 40} characters is handed to
 * {@code splitLongText}; anything else is added as one part after passing
 * through {@code ensureSentenceEnding}.
 *
 * @param parts   destination list, appended to in place
 * @param current buffer holding the part being built; always cleared
 */
private static void addReadablePart(List<String> parts, StringBuilder current) {
    String candidate = current.toString().trim();
    current.setLength(0);
    if (StringUtils.hasText(candidate)) {
        if (candidate.length() > NATURAL_PARAGRAPH_LIMIT + 40) {
            splitLongText(parts, candidate);
        } else {
            parts.add(ensureSentenceEnding(candidate));
        }
    }
}
/**
 * Cuts an over-long text into consecutive slices of at most
 * {@code NATURAL_PARAGRAPH_LIMIT} characters and appends each slice, trimmed
 * and passed through {@code ensureSentenceEnding}, to {@code parts}.
 *
 * @param parts destination list, appended to in place
 * @param value the text to slice
 */
private static void splitLongText(List<String> parts, String value) {
    int total = value.length();
    for (int start = 0; start < total; start += NATURAL_PARAGRAPH_LIMIT) {
        // The last slice may be shorter than the limit.
        int end = Math.min(start + NATURAL_PARAGRAPH_LIMIT, total);
        String slice = value.substring(start, end).trim();
        parts.add(ensureSentenceEnding(slice));
    }
}
/**
 * Makes sure a text fragment ends with pause punctuation.
 *
 * <p>Blank input yields an empty string. A fragment whose last character is
 * already a hard sentence end or a soft pause is returned unchanged;
 * otherwise an ideographic full stop (U+3002) is appended.
 *
 * @param value the fragment to terminate; may be {@code null} or blank
 * @return the fragment, guaranteed to end with pause punctuation, or {@code ""}
 */
private static String ensureSentenceEnding(String value) {
    if (!StringUtils.hasText(value)) {
        return "";
    }
    char last = value.charAt(value.length() - 1);
    if (isHardSentenceEnd(last) || isSoftPause(last)) {
        return value;
    }
    // The suffix literal had lost its character and appended nothing.
    // Written as an escape so the full stop survives encoding mishaps.
    return value + "\u3002";
}
private static boolean isHardSentenceEnd(char ch) {
return ch == '。' || ch == '' || ch == '' || ch == '' || ch == '…';
}
private static boolean isSoftPause(char ch) {
return ch == '' || ch == '、' || ch == '';
}
private static String limitReadableText(String text, int limit) {
if (text.length() <= limit) {
return text;
}
String truncated = text.substring(0, limit);
int cut = Math.max(
Math.max(truncated.lastIndexOf('。'), truncated.lastIndexOf('')),
Math.max(truncated.lastIndexOf(''), truncated.lastIndexOf('\n'))
);
if (cut > limit * 0.75) {
return truncated.substring(0, cut + 1).trim();
}
return ensureSentenceEnding(truncated.trim());
}
/**
 * Limits a value to the range {@code [min, max]}.
 *
 * @param value the value to limit
 * @param min   lower bound
 * @param max   upper bound
 * @return {@code value} capped at {@code max} and then raised to at least {@code min}
 */
private static double clamp(double value, double min, double max) {
    double capped = Math.min(max, value);
    return Math.max(min, capped);
}
/**
 * Rounds a value to two decimal places.
 *
 * @param value the value to round
 * @return {@code value} scaled by 100, rounded with {@link Math#round(double)}, and scaled back
 */
private static double round(double value) {
    long hundredths = Math.round(value * 100D);
    return hundredths / 100D;
}
/**
 * Returns {@code value} unless it is zero or negative, in which case
 * {@code fallback} is returned instead.
 *
 * @param value    the preferred value
 * @param fallback replacement used when {@code value <= 0}
 * @return {@code value} or {@code fallback}
 */
private static double defaultOrFallback(double value, double fallback) {
    if (value <= 0D) {
        return fallback;
    }
    return value;
}
private static String joinPath(String prefix, String filename) {
if (prefix.endsWith("/")) {
return prefix + filename;
@@ -69,6 +69,9 @@ emotion:
public-url-prefix: /tts/audio
max-text-length: 5000
default-voice: default_zh_female
default-speech-rate: 0.92
default-pitch: 0
default-emotion: story
# Speech-to-text config
asr:
@@ -106,6 +106,9 @@ emotion:
public-url-prefix: /tts/audio
max-text-length: 5000
default-voice: default_zh_female
default-speech-rate: 0.92
default-pitch: 0
default-emotion: story
# Speech-to-text config
asr:
@@ -12,11 +12,19 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
class TtsTaskServiceTest {
@Test
@DisplayName("cleanText strips markdown and normalizes whitespace")
void cleanTextStripsMarkdownAndNormalizesWhitespace() {
String cleaned = TtsTaskServiceImpl.cleanText("# Title\n\n> **hello** `world` - ok");
@DisplayName("cleanText strips markdown but keeps Chinese narration rhythm")
void cleanTextStripsMarkdownButKeepsChineseNarrationRhythm() {
String cleaned = TtsTaskServiceImpl.cleanText("# 第一章\n\n> **她 终于** 看见了自己\n\n- 转身离开");
assertEquals("Title hello world ok", cleaned);
assertEquals("第一章。\n\n她终于看见了自己。\n\n转身离开。", cleaned);
}
@Test
@DisplayName("cleanText preserves sentence punctuation for natural pauses")
void cleanTextPreservesSentencePunctuationForNaturalPauses() {
    // Two paragraphs of full-width punctuation: only the space after the colon
    // should disappear, and the paragraph break should be kept.
    String raw = "他说: 这一次,我想自己选择!\n\n你听见了吗?";
    String expected = "他说:这一次,我想自己选择!\n\n你听见了吗?";

    assertEquals(expected, TtsTaskServiceImpl.cleanText(raw));
}
@Test
@@ -28,6 +36,9 @@ class TtsTaskServiceTest {
@Test
@DisplayName("TtsEngineResult exposes synthesis result fields")
void ttsEngineResultExposesFields() {
TtsEngineClient.SynthesisOptions options = new TtsEngineClient.SynthesisOptions(0.92D, 0D, "story");
assertEquals("rate=0.92;pitch=0.0;emotion=story", options.cacheKey());
TtsEngineClient.TtsEngineResult result =
new TtsEngineClient.TtsEngineResult(true, "/tmp/a.mp3", 1200L, null);
+38 -2
View File
@@ -1,5 +1,6 @@
import subprocess
from pathlib import Path
from typing import Optional
from fastapi import FastAPI
from pydantic import BaseModel, Field
@@ -16,6 +17,42 @@ class SynthesizeRequest(BaseModel):
text: str = Field(min_length=1, max_length=5000)
voice: str = "default_zh_female"
outputPath: str
speechRate: Optional[float] = Field(default=0.92, ge=0.6, le=1.4)
pitch: Optional[float] = Field(default=0.0, ge=-20.0, le=20.0)
emotion: Optional[str] = "story"
def clamp(value: float, minimum: float, maximum: float) -> float:
    """Return ``value`` capped at ``maximum`` and raised to at least ``minimum``."""
    capped = min(maximum, value)
    return max(minimum, capped)
def resolve_piper_args(request: SynthesizeRequest) -> list[str]:
    """Translate the request's prosody settings into extra command-line flags.

    The speech rate (0.92 when unset, clamped to 0.6-1.4) is inverted into a
    length scale rounded to two decimals. The lower-cased emotion ("story" when
    unset) selects one of three presets for sentence silence and the two noise
    values. ``request.pitch`` is not read here.

    Returns a flat list of flag/value strings in a fixed order:
    sentence silence, length scale, noise scale, noise w.
    """
    rate = clamp(float(request.speechRate or 0.92), 0.6, 1.4)
    mood = (request.emotion or "story").lower()

    # Each preset is (sentence_silence, noise_scale, noise_w).
    if mood in {"calm", "soft", "warm"}:
        silence, noise, noise_width = 0.5, 0.58, 0.68
    elif mood in {"story", "narration", "expressive"}:
        silence, noise, noise_width = 0.48, 0.68, 0.76
    else:
        # Any other emotion label gets the neutral preset.
        silence, noise, noise_width = 0.46, 0.64, 0.72

    flag_values = {
        "--sentence-silence": silence,
        "--length_scale": round(1.0 / rate, 2),
        "--noise_scale": noise,
        "--noise_w": noise_width,
    }
    args = []
    for flag, setting in flag_values.items():
        args.extend((flag, str(setting)))
    return args
@app.get("/health")
@@ -47,8 +84,7 @@ def synthesize(request: SynthesizeRequest):
str(PIPER_CONFIG),
"--output_file",
str(output),
"--sentence-silence",
"0.35",
*resolve_piper_args(request),
],
input=request.text,
text=True,