feat: TTS 服务功能完善(任务管理、配置优化、客户端实现)

This commit is contained in:
2026-05-26 20:49:58 +08:00
parent 2d7776dd4d
commit c289097ca0
11 changed files with 307 additions and 39 deletions
@@ -61,8 +61,11 @@ public class TtsController {
@GetMapping("/tasks/by-source")
public Result<TtsTaskResponse> bySource(@Parameter(description = "来源类型") @RequestParam String sourceType,
@Parameter(description = "来源 ID") @RequestParam String sourceId,
@Parameter(description = "音色") @RequestParam(required = false) String voice) {
return Result.success(ttsTaskService.getBySource(sourceType, sourceId, voice));
@Parameter(description = "音色") @RequestParam(required = false) String voice,
@Parameter(description = "语速") @RequestParam(required = false) Double speechRate,
@Parameter(description = "音调") @RequestParam(required = false) Double pitch,
@Parameter(description = "情绪") @RequestParam(required = false) String emotion) {
return Result.success(ttsTaskService.getBySource(sourceType, sourceId, voice, speechRate, pitch, emotion));
}
@Operation(summary = "获取音频文件", description = "返回已合成的音频音频文件(MP3 或 WAV 格式)。")
@@ -3,6 +3,8 @@ package com.emotion.dto.request.tts;
import lombok.Data;
import javax.validation.constraints.NotBlank;
import javax.validation.constraints.DecimalMax;
import javax.validation.constraints.DecimalMin;
import javax.validation.constraints.Size;
@Data
@@ -18,4 +20,15 @@ public class TtsTaskCreateRequest {
@Size(max = 64)
private String voice;
@DecimalMin("0.60")
@DecimalMax("1.40")
private Double speechRate;
@DecimalMin("-20.00")
@DecimalMax("20.00")
private Double pitch;
@Size(max = 32)
private String emotion;
}
@@ -12,6 +12,9 @@ public class TtsTaskResponse {
private String sourceId;
private String status;
private String voice;
private Double speechRate;
private Double pitch;
private String emotion;
private String audioUrl;
private Long durationMs;
private String errorMessage;
@@ -2,7 +2,37 @@ package com.emotion.service;
public interface TtsEngineClient {
TtsEngineResult synthesize(String text, String voice, String outputPath);
TtsEngineResult synthesize(String text, String voice, String outputPath, SynthesisOptions options);
/**
 * Immutable holder for the optional tuning values passed along with a synthesis call.
 * Each of the three values may be {@code null}; nothing is validated or normalized here.
 */
class SynthesisOptions {
    private final Double speechRate;
    private final Double pitch;
    private final String emotion;

    /**
     * Stores the given values as-is.
     *
     * @param speechRate speech rate, may be {@code null}
     * @param pitch      pitch, may be {@code null}
     * @param emotion    emotion label, may be {@code null}
     */
    public SynthesisOptions(Double speechRate, Double pitch, String emotion) {
        this.speechRate = speechRate;
        this.pitch = pitch;
        this.emotion = emotion;
    }

    /** @return the stored speech rate, possibly {@code null} */
    public Double getSpeechRate() {
        return speechRate;
    }

    /** @return the stored pitch, possibly {@code null} */
    public Double getPitch() {
        return pitch;
    }

    /** @return the stored emotion label, possibly {@code null} */
    public String getEmotion() {
        return emotion;
    }

    /**
     * Renders the three values as {@code rate=<r>;pitch=<p>;emotion=<e>}.
     * A {@code null} value contributes an empty string after its {@code =}.
     *
     * @return the textual key built from the current values
     */
    public String cacheKey() {
        StringBuilder key = new StringBuilder("rate=");
        if (speechRate != null) {
            key.append(speechRate);
        }
        key.append(";pitch=");
        if (pitch != null) {
            key.append(pitch);
        }
        key.append(";emotion=");
        if (emotion != null) {
            key.append(emotion);
        }
        return key.toString();
    }
}
class TtsEngineResult {
private final boolean success;
@@ -11,5 +11,6 @@ public interface TtsTaskService extends IService<TtsTask> {
TtsTaskResponse getTask(String id);
TtsTaskResponse getBySource(String sourceType, String sourceId, String voice);
TtsTaskResponse getBySource(String sourceType, String sourceId, String voice,
Double speechRate, Double pitch, String emotion);
}
@@ -5,7 +5,9 @@ import org.springframework.beans.factory.annotation.Value;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Service;
import org.springframework.web.client.RestTemplate;
import org.springframework.util.StringUtils;
import java.util.HashMap;
import java.util.Map;
@Service
@@ -21,13 +23,23 @@ public class HttpTtsEngineClient implements TtsEngineClient {
}
@Override
public TtsEngineResult synthesize(String text, String voice, String outputPath) {
public TtsEngineResult synthesize(String text, String voice, String outputPath, SynthesisOptions options) {
try {
Map<String, Object> body = Map.of(
"text", text,
"voice", voice,
"outputPath", outputPath
);
Map<String, Object> body = new HashMap<>();
body.put("text", text);
body.put("voice", voice);
body.put("outputPath", outputPath);
if (options != null) {
if (options.getSpeechRate() != null) {
body.put("speechRate", options.getSpeechRate());
}
if (options.getPitch() != null) {
body.put("pitch", options.getPitch());
}
if (StringUtils.hasText(options.getEmotion())) {
body.put("emotion", options.getEmotion());
}
}
ResponseEntity<Map> response = restTemplate.postForEntity(engineUrl + "/synthesize", body, Map.class);
Map<?, ?> data = response.getBody();
boolean success = data != null && Boolean.TRUE.equals(data.get("success"));
@@ -19,6 +19,9 @@ import org.springframework.util.DigestUtils;
import org.springframework.util.StringUtils;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
@@ -32,6 +35,10 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
private static final String STATUS_PROCESSING = "processing";
private static final String STATUS_SUCCESS = "success";
private static final String STATUS_FAILED = "failed";
private static final double FALLBACK_SPEECH_RATE = 0.92D;
private static final double FALLBACK_PITCH = 0D;
private static final String FALLBACK_EMOTION = "story";
private static final int NATURAL_PARAGRAPH_LIMIT = 140;
private final EpicScriptMapper epicScriptMapper;
private final TtsEngineClient ttsEngineClient;
@@ -52,6 +59,15 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
@Value("${emotion.tts.default-voice:default_zh_female}")
private String defaultVoice;
@Value("${emotion.tts.default-speech-rate:0.92}")
private double defaultSpeechRate;
@Value("${emotion.tts.default-pitch:0}")
private double defaultPitch;
@Value("${emotion.tts.default-emotion:story}")
private String defaultEmotion;
public TtsTaskServiceImpl(EpicScriptMapper epicScriptMapper,
TtsEngineClient ttsEngineClient,
@Qualifier("taskExecutor") Executor taskExecutor) {
@@ -70,15 +86,16 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
String sourceType = normalizeSourceType(request.getSourceType());
String sourceId = request.getSourceId().trim();
String voice = resolveVoice(request.getVoice());
TtsEngineClient.SynthesisOptions options = resolveOptions(request);
String cleaned = cleanText(loadSourceText(userId, sourceType, sourceId));
if (!StringUtils.hasText(cleaned)) {
throw new IllegalArgumentException("Source text is empty");
}
if (cleaned.length() > maxTextLength) {
cleaned = cleaned.substring(0, maxTextLength);
cleaned = limitReadableText(cleaned, maxTextLength);
}
String hash = DigestUtils.md5DigestAsHex((voice + "\n" + cleaned).getBytes(StandardCharsets.UTF_8));
String hash = DigestUtils.md5DigestAsHex((voice + "\n" + options.cacheKey() + "\n" + cleaned).getBytes(StandardCharsets.UTF_8));
TtsTask owned = findOwnedTask(userId, sourceType, sourceId, voice, hash);
if (owned != null) {
incrementRequestCount(owned);
@@ -100,7 +117,7 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
TtsTask task = buildTask(userId, sourceType, sourceId, voice, hash, cleaned.length());
save(task);
String synthesisText = cleaned;
CompletableFuture.runAsync(() -> process(task.getId(), synthesisText, voice, task.getAudioPath()), taskExecutor);
CompletableFuture.runAsync(() -> process(task.getId(), synthesisText, voice, task.getAudioPath(), options), taskExecutor);
return toResponse(task);
}
@@ -115,20 +132,22 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
}
@Override
public TtsTaskResponse getBySource(String sourceType, String sourceId, String voice) {
public TtsTaskResponse getBySource(String sourceType, String sourceId, String voice,
Double speechRate, Double pitch, String emotion) {
String userId = currentUserId();
TtsTask task = getOne(new LambdaQueryWrapper<TtsTask>()
.eq(TtsTask::getUserId, userId)
.eq(TtsTask::getSourceType, normalizeSourceType(sourceType))
.eq(TtsTask::getSourceId, sourceId)
.eq(TtsTask::getVoice, resolveVoice(voice))
.eq(TtsTask::getIsDeleted, 0)
.orderByDesc(TtsTask::getCreateTime)
.last("LIMIT 1"));
String normalizedSourceType = normalizeSourceType(sourceType);
String normalizedVoice = resolveVoice(voice);
TtsEngineClient.SynthesisOptions options = resolveOptions(speechRate, pitch, emotion);
String cleaned = cleanText(loadSourceText(userId, normalizedSourceType, sourceId));
if (cleaned.length() > maxTextLength) {
cleaned = limitReadableText(cleaned, maxTextLength);
}
String hash = DigestUtils.md5DigestAsHex((normalizedVoice + "\n" + options.cacheKey() + "\n" + cleaned).getBytes(StandardCharsets.UTF_8));
TtsTask task = findOwnedTask(userId, normalizedSourceType, sourceId, normalizedVoice, hash);
return task == null ? null : toResponse(task);
}
private void process(String taskId, String text, String voice, String outputPath) {
private void process(String taskId, String text, String voice, String outputPath, TtsEngineClient.SynthesisOptions options) {
try {
TtsTask task = getById(taskId);
if (task == null) {
@@ -138,7 +157,7 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
task.setErrorMessage(null);
updateById(task);
TtsEngineClient.TtsEngineResult result = ttsEngineClient.synthesize(text, voice, outputPath);
TtsEngineClient.TtsEngineResult result = ttsEngineClient.synthesize(text, voice, outputPath, options);
task = getById(taskId);
if (task == null) {
return;
@@ -220,13 +239,15 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
StringBuilder text = new StringBuilder();
append(text, script.getTitle());
append(text, script.getPlotIntro());
append(text, script.getPlotTurning());
append(text, script.getPlotClimax());
append(text, script.getPlotEnding());
Map<String, Object> plotJson = script.getPlotJson();
if (plotJson != null && plotJson.get("fullContent") != null) {
append(text, String.valueOf(plotJson.get("fullContent")));
Object fullContent = plotJson == null ? null : plotJson.get("fullContent");
if (fullContent != null && StringUtils.hasText(String.valueOf(fullContent))) {
append(text, String.valueOf(fullContent));
} else {
append(text, script.getPlotIntro());
append(text, script.getPlotTurning());
append(text, script.getPlotClimax());
append(text, script.getPlotEnding());
}
return text.toString();
}
@@ -235,9 +256,39 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
if (text == null) {
return "";
}
return text.replaceAll("[#>*_`\\-]", "")
.replaceAll("\\s+", " ")
.trim();
String normalized = text.replace("\r\n", "\n")
.replace('\r', '\n')
.replaceAll("!\\[[^\\]]*]\\([^)]*\\)", "")
.replaceAll("\\[([^\\]]+)]\\([^)]*\\)", "$1")
.replaceAll("(?m)^\\s{0,3}#{1,6}\\s*", "")
.replaceAll("(?m)^\\s*>\\s?", "")
.replaceAll("(?m)^\\s*[-*+]\\s+", "")
.replaceAll("(?m)^\\s*\\d+[.)、]\\s+", "")
.replaceAll("<[^>]+>", "")
.replaceAll("[*_`~]", "")
.replaceAll("[“”]", "\"")
.replaceAll("[‘’]", "'")
.replaceAll("\\.{3,}", "……")
.replaceAll("-{2,}", "")
.replaceAll("[\\t\\u00A0]+", " ")
.replaceAll(" {2,}", " ")
.replaceAll("(?<=[\\p{IsHan}])[ \\t\\u00A0]+(?=[\\p{IsHan}])", "")
.replaceAll("[ \\t\\u00A0]*([,。!?;:、,.!?;:])[ \\t\\u00A0]*", "$1")
.replaceAll(",", "")
.replaceAll("!", "")
.replaceAll("\\?", "")
.replaceAll(";", "")
.replaceAll(":", "");
List<String> paragraphs = new ArrayList<>();
for (String paragraph : normalized.split("\\n+")) {
String trimmed = paragraph.trim();
if (!StringUtils.hasText(trimmed)) {
continue;
}
paragraphs.addAll(toReadableParagraphs(trimmed));
}
return String.join("\n\n", paragraphs).trim();
}
private TtsTaskResponse toResponse(TtsTask task) {
@@ -253,6 +304,23 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
.build();
}
/**
 * Builds synthesis options from a create-task request by handing its speech rate,
 * pitch and emotion to the three-argument overload.
 *
 * @param request the incoming request; must not be {@code null}
 * @return the resolved options
 */
private TtsEngineClient.SynthesisOptions resolveOptions(TtsTaskCreateRequest request) {
    Double requestedRate = request.getSpeechRate();
    Double requestedPitch = request.getPitch();
    String requestedEmotion = request.getEmotion();
    return resolveOptions(requestedRate, requestedPitch, requestedEmotion);
}
/**
 * Resolves the effective synthesis options from optional request values.
 *
 * <p>A missing speech rate falls back to {@code defaultSpeechRate} (or
 * {@code FALLBACK_SPEECH_RATE} when that value is not positive); a missing pitch falls
 * back to {@code defaultPitch}; a blank emotion falls back to {@code defaultEmotion},
 * or {@code FALLBACK_EMOTION} when that is blank too. The speech rate is clamped to
 * [0.60, 1.40], the pitch to [-20, 20], both are rounded to two decimals, and the
 * emotion is trimmed and lower-cased with {@link Locale#ROOT}.
 *
 * <p>A NaN request value is treated like a missing one: NaN passes through
 * {@code clamp} unchanged and {@code Math.round} maps it to 0, which would otherwise
 * yield a speech rate of 0.0 outside the clamped range.
 *
 * @param requestSpeechRate requested speech rate, may be {@code null} or NaN
 * @param requestPitch      requested pitch, may be {@code null} or NaN
 * @param requestEmotion    requested emotion label, may be {@code null} or blank
 * @return the normalized options
 */
private TtsEngineClient.SynthesisOptions resolveOptions(Double requestSpeechRate, Double requestPitch, String requestEmotion) {
    boolean hasSpeechRate = requestSpeechRate != null && !requestSpeechRate.isNaN();
    boolean hasPitch = requestPitch != null && !requestPitch.isNaN();
    double speechRate = hasSpeechRate
            ? requestSpeechRate
            : defaultOrFallback(defaultSpeechRate, FALLBACK_SPEECH_RATE);
    // No positive-only fallback for pitch: 0 is a legitimate configured value.
    double pitch = hasPitch ? requestPitch : defaultPitch;
    String emotion = StringUtils.hasText(requestEmotion)
            ? requestEmotion.trim()
            : (StringUtils.hasText(defaultEmotion) ? defaultEmotion.trim() : FALLBACK_EMOTION);
    return new TtsEngineClient.SynthesisOptions(
            round(clamp(speechRate, 0.60D, 1.40D)),
            round(clamp(pitch, -20D, 20D)),
            emotion.toLowerCase(Locale.ROOT)
    );
}
private String currentUserId() {
String userId = UserContextHolder.getCurrentUserId();
if (!StringUtils.hasText(userId)) {
@@ -275,6 +343,91 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
}
}
/**
 * Cuts one paragraph into shorter parts while scanning it character by character.
 * The pending text is flushed through {@code addReadablePart} whenever the current
 * character is a hard sentence end, or when it is a soft pause and the pending text
 * has reached {@code NATURAL_PARAGRAPH_LIMIT} characters. Whatever remains after the
 * last character is flushed as well.
 *
 * @param paragraph the paragraph text to split
 * @return the parts in their original order
 */
private static List<String> toReadableParagraphs(String paragraph) {
    List<String> pieces = new ArrayList<>();
    StringBuilder buffer = new StringBuilder();
    for (char symbol : paragraph.toCharArray()) {
        buffer.append(symbol);
        boolean sentenceFinished = isHardSentenceEnd(symbol);
        boolean breakAtPause = !sentenceFinished
                && buffer.length() >= NATURAL_PARAGRAPH_LIMIT
                && isSoftPause(symbol);
        if (sentenceFinished || breakAtPause) {
            addReadablePart(pieces, buffer);
        }
    }
    // Flush the tail that did not end on a break character.
    addReadablePart(pieces, buffer);
    return pieces;
}
/**
 * Moves the pending text out of {@code current} into {@code parts}.
 * The builder is always emptied. Text that is blank after trimming is dropped; text
 * longer than {@code NATURAL_PARAGRAPH_LIMIT + 40} characters is handed to
 * {@code splitLongText}; anything else is added after passing through
 * {@code ensureSentenceEnding}.
 *
 * @param parts   list receiving the finished parts
 * @param current builder holding the pending text; reset to length 0 by this call
 */
private static void addReadablePart(List<String> parts, StringBuilder current) {
    String text = current.toString().trim();
    current.setLength(0);
    if (StringUtils.hasText(text)) {
        if (text.length() > NATURAL_PARAGRAPH_LIMIT + 40) {
            splitLongText(parts, text);
        } else {
            parts.add(ensureSentenceEnding(text));
        }
    }
}
/**
 * Splits an over-long text into consecutive chunks of {@code NATURAL_PARAGRAPH_LIMIT}
 * characters and appends each trimmed chunk, passed through
 * {@code ensureSentenceEnding}, to {@code parts}. The final chunk holds the remainder.
 *
 * <p>A chunk is extended by one char when the cut would otherwise fall between the two
 * halves of a UTF-16 surrogate pair, so a supplementary character is never torn apart
 * into two unpaired surrogates.
 *
 * @param parts list receiving the chunks
 * @param value the text to split
 */
private static void splitLongText(List<String> parts, String value) {
    int length = value.length();
    int start = 0;
    while (start < length) {
        int end = Math.min(start + NATURAL_PARAGRAPH_LIMIT, length);
        // Keep a high/low surrogate pair together in the same chunk.
        if (end < length
                && Character.isHighSurrogate(value.charAt(end - 1))
                && Character.isLowSurrogate(value.charAt(end))) {
            end++;
        }
        parts.add(ensureSentenceEnding(value.substring(start, end).trim()));
        start = end;
    }
}
/**
 * Returns an empty string for blank input; otherwise returns the value, appending a
 * suffix when the last character is neither a hard sentence end nor a soft pause.
 *
 * NOTE(review): the appended literal is empty in this view, so that branch currently
 * returns the value unchanged. A closing punctuation character may have been lost
 * from the literal - confirm against the repository.
 */
private static String ensureSentenceEnding(String value) {
if (!StringUtils.hasText(value)) {
return "";
}
char last = value.charAt(value.length() - 1);
return isHardSentenceEnd(last) || isSoftPause(last) ? value : value + "";
}
/**
 * Tells whether the character is one of the marks treated as a hard sentence end.
 *
 * NOTE(review): three of the compared character literals are empty in this view, which
 * is not valid Java. They presumably held punctuation characters that were lost -
 * restore them from the repository before compiling.
 */
private static boolean isHardSentenceEnd(char ch) {
return ch == '。' || ch == '' || ch == '' || ch == '' || ch == '…';
}
/**
 * Tells whether the character is one of the marks treated as a soft pause.
 *
 * NOTE(review): two of the compared character literals are empty in this view, which
 * is not valid Java. They presumably held punctuation characters that were lost -
 * restore them from the repository before compiling.
 */
private static boolean isSoftPause(char ch) {
return ch == '' || ch == '、' || ch == '';
}
/**
 * Shortens text to at most {@code limit} characters, preferring to end on a break
 * character.
 *
 * Text no longer than the limit is returned unchanged. Otherwise the first
 * {@code limit} characters are taken and the last position of any of four break
 * characters is located. When that position lies beyond 75% of the limit, the text up
 * to and including it is returned, trimmed; otherwise the trimmed truncation is
 * passed through {@code ensureSentenceEnding}.
 *
 * NOTE(review): two of the four character literals searched for are empty in this
 * view, which is not valid Java. They presumably held punctuation characters that were
 * lost - restore them from the repository before compiling.
 *
 * @param text  the text to shorten
 * @param limit maximum number of characters to keep
 * @return the original or shortened text
 */
private static String limitReadableText(String text, int limit) {
if (text.length() <= limit) {
return text;
}
String truncated = text.substring(0, limit);
// Latest position of any candidate break character inside the truncated prefix (-1 if none).
int cut = Math.max(
Math.max(truncated.lastIndexOf('。'), truncated.lastIndexOf('')),
Math.max(truncated.lastIndexOf(''), truncated.lastIndexOf('\n'))
);
// Only cut at the break when it keeps more than three quarters of the allowed length.
if (cut > limit * 0.75) {
return truncated.substring(0, cut + 1).trim();
}
return ensureSentenceEnding(truncated.trim());
}
/**
 * Restricts a value to the range between {@code min} and {@code max}.
 *
 * @param value the value to restrict
 * @param min   lower bound
 * @param max   upper bound
 * @return {@code value} capped at {@code max} and then raised to at least {@code min}
 */
private static double clamp(double value, double min, double max) {
    double capped = Math.min(max, value);
    return Math.max(min, capped);
}
/**
 * Rounds a value to two decimal places using {@link Math#round(double)} on the value
 * scaled by 100.
 *
 * @param value the value to round
 * @return the value rounded to hundredths
 */
private static double round(double value) {
    long hundredths = Math.round(value * 100D);
    return hundredths / 100D;
}
/**
 * Picks the fallback when the given value is zero or negative, otherwise the value.
 *
 * @param value    the preferred value
 * @param fallback the replacement used when {@code value <= 0}
 * @return {@code fallback} if {@code value <= 0}, else {@code value}
 */
private static double defaultOrFallback(double value, double fallback) {
    if (value <= 0D) {
        return fallback;
    }
    return value;
}
private static String joinPath(String prefix, String filename) {
if (prefix.endsWith("/")) {
return prefix + filename;