feat: TTS 服务功能完善(任务管理、配置优化、客户端实现)
This commit is contained in:
@@ -61,8 +61,11 @@ public class TtsController {
|
||||
@GetMapping("/tasks/by-source")
|
||||
public Result<TtsTaskResponse> bySource(@Parameter(description = "来源类型") @RequestParam String sourceType,
|
||||
@Parameter(description = "来源 ID") @RequestParam String sourceId,
|
||||
@Parameter(description = "音色") @RequestParam(required = false) String voice) {
|
||||
return Result.success(ttsTaskService.getBySource(sourceType, sourceId, voice));
|
||||
@Parameter(description = "音色") @RequestParam(required = false) String voice,
|
||||
@Parameter(description = "语速") @RequestParam(required = false) Double speechRate,
|
||||
@Parameter(description = "音调") @RequestParam(required = false) Double pitch,
|
||||
@Parameter(description = "情绪") @RequestParam(required = false) String emotion) {
|
||||
return Result.success(ttsTaskService.getBySource(sourceType, sourceId, voice, speechRate, pitch, emotion));
|
||||
}
|
||||
|
||||
@Operation(summary = "获取音频文件", description = "返回已合成的音频音频文件(MP3 或 WAV 格式)。")
|
||||
|
||||
@@ -3,6 +3,8 @@ package com.emotion.dto.request.tts;
|
||||
import lombok.Data;
|
||||
|
||||
import javax.validation.constraints.NotBlank;
|
||||
import javax.validation.constraints.DecimalMax;
|
||||
import javax.validation.constraints.DecimalMin;
|
||||
import javax.validation.constraints.Size;
|
||||
|
||||
@Data
|
||||
@@ -18,4 +20,15 @@ public class TtsTaskCreateRequest {
|
||||
|
||||
@Size(max = 64)
|
||||
private String voice;
|
||||
|
||||
@DecimalMin("0.60")
|
||||
@DecimalMax("1.40")
|
||||
private Double speechRate;
|
||||
|
||||
@DecimalMin("-20.00")
|
||||
@DecimalMax("20.00")
|
||||
private Double pitch;
|
||||
|
||||
@Size(max = 32)
|
||||
private String emotion;
|
||||
}
|
||||
|
||||
@@ -12,6 +12,9 @@ public class TtsTaskResponse {
|
||||
private String sourceId;
|
||||
private String status;
|
||||
private String voice;
|
||||
private Double speechRate;
|
||||
private Double pitch;
|
||||
private String emotion;
|
||||
private String audioUrl;
|
||||
private Long durationMs;
|
||||
private String errorMessage;
|
||||
|
||||
@@ -2,7 +2,37 @@ package com.emotion.service;
|
||||
|
||||
public interface TtsEngineClient {
|
||||
|
||||
TtsEngineResult synthesize(String text, String voice, String outputPath);
|
||||
TtsEngineResult synthesize(String text, String voice, String outputPath, SynthesisOptions options);
|
||||
|
||||
/**
 * Immutable set of optional tuning parameters passed along with a synthesis request.
 *
 * <p>Each value may be {@code null}; the constructor performs no validation.
 */
class SynthesisOptions {

    private final Double speechRate;
    private final Double pitch;
    private final String emotion;

    /**
     * Stores the given values as-is.
     *
     * @param speechRate speech rate, or {@code null}
     * @param pitch      pitch, or {@code null}
     * @param emotion    emotion label, or {@code null}
     */
    public SynthesisOptions(Double speechRate, Double pitch, String emotion) {
        this.speechRate = speechRate;
        this.pitch = pitch;
        this.emotion = emotion;
    }

    /** @return the speech rate, possibly {@code null} */
    public Double getSpeechRate() {
        return speechRate;
    }

    /** @return the pitch, possibly {@code null} */
    public Double getPitch() {
        return pitch;
    }

    /** @return the emotion label, possibly {@code null} */
    public String getEmotion() {
        return emotion;
    }

    /**
     * Renders the three options as {@code rate=<r>;pitch=<p>;emotion=<e>}.
     * A {@code null} option contributes an empty value after its {@code =}.
     *
     * @return the textual key for this option combination
     */
    public String cacheKey() {
        StringBuilder key = new StringBuilder();
        key.append("rate=");
        if (speechRate != null) {
            key.append(speechRate);
        }
        key.append(";pitch=");
        if (pitch != null) {
            key.append(pitch);
        }
        key.append(";emotion=");
        if (emotion != null) {
            key.append(emotion);
        }
        return key.toString();
    }
}
|
||||
|
||||
class TtsEngineResult {
|
||||
private final boolean success;
|
||||
|
||||
@@ -11,5 +11,6 @@ public interface TtsTaskService extends IService<TtsTask> {
|
||||
|
||||
TtsTaskResponse getTask(String id);
|
||||
|
||||
TtsTaskResponse getBySource(String sourceType, String sourceId, String voice);
|
||||
TtsTaskResponse getBySource(String sourceType, String sourceId, String voice,
|
||||
Double speechRate, Double pitch, String emotion);
|
||||
}
|
||||
|
||||
@@ -5,7 +5,9 @@ import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.web.client.RestTemplate;
|
||||
import org.springframework.util.StringUtils;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
@Service
|
||||
@@ -21,13 +23,23 @@ public class HttpTtsEngineClient implements TtsEngineClient {
|
||||
}
|
||||
|
||||
@Override
|
||||
public TtsEngineResult synthesize(String text, String voice, String outputPath) {
|
||||
public TtsEngineResult synthesize(String text, String voice, String outputPath, SynthesisOptions options) {
|
||||
try {
|
||||
Map<String, Object> body = Map.of(
|
||||
"text", text,
|
||||
"voice", voice,
|
||||
"outputPath", outputPath
|
||||
);
|
||||
Map<String, Object> body = new HashMap<>();
|
||||
body.put("text", text);
|
||||
body.put("voice", voice);
|
||||
body.put("outputPath", outputPath);
|
||||
if (options != null) {
|
||||
if (options.getSpeechRate() != null) {
|
||||
body.put("speechRate", options.getSpeechRate());
|
||||
}
|
||||
if (options.getPitch() != null) {
|
||||
body.put("pitch", options.getPitch());
|
||||
}
|
||||
if (StringUtils.hasText(options.getEmotion())) {
|
||||
body.put("emotion", options.getEmotion());
|
||||
}
|
||||
}
|
||||
ResponseEntity<Map> response = restTemplate.postForEntity(engineUrl + "/synthesize", body, Map.class);
|
||||
Map<?, ?> data = response.getBody();
|
||||
boolean success = data != null && Boolean.TRUE.equals(data.get("success"));
|
||||
|
||||
@@ -19,6 +19,9 @@ import org.springframework.util.DigestUtils;
|
||||
import org.springframework.util.StringUtils;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.Executor;
|
||||
@@ -32,6 +35,10 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
|
||||
private static final String STATUS_PROCESSING = "processing";
|
||||
private static final String STATUS_SUCCESS = "success";
|
||||
private static final String STATUS_FAILED = "failed";
|
||||
private static final double FALLBACK_SPEECH_RATE = 0.92D;
|
||||
private static final double FALLBACK_PITCH = 0D;
|
||||
private static final String FALLBACK_EMOTION = "story";
|
||||
private static final int NATURAL_PARAGRAPH_LIMIT = 140;
|
||||
|
||||
private final EpicScriptMapper epicScriptMapper;
|
||||
private final TtsEngineClient ttsEngineClient;
|
||||
@@ -52,6 +59,15 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
|
||||
@Value("${emotion.tts.default-voice:default_zh_female}")
|
||||
private String defaultVoice;
|
||||
|
||||
@Value("${emotion.tts.default-speech-rate:0.92}")
|
||||
private double defaultSpeechRate;
|
||||
|
||||
@Value("${emotion.tts.default-pitch:0}")
|
||||
private double defaultPitch;
|
||||
|
||||
@Value("${emotion.tts.default-emotion:story}")
|
||||
private String defaultEmotion;
|
||||
|
||||
public TtsTaskServiceImpl(EpicScriptMapper epicScriptMapper,
|
||||
TtsEngineClient ttsEngineClient,
|
||||
@Qualifier("taskExecutor") Executor taskExecutor) {
|
||||
@@ -70,15 +86,16 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
|
||||
String sourceType = normalizeSourceType(request.getSourceType());
|
||||
String sourceId = request.getSourceId().trim();
|
||||
String voice = resolveVoice(request.getVoice());
|
||||
TtsEngineClient.SynthesisOptions options = resolveOptions(request);
|
||||
String cleaned = cleanText(loadSourceText(userId, sourceType, sourceId));
|
||||
if (!StringUtils.hasText(cleaned)) {
|
||||
throw new IllegalArgumentException("Source text is empty");
|
||||
}
|
||||
if (cleaned.length() > maxTextLength) {
|
||||
cleaned = cleaned.substring(0, maxTextLength);
|
||||
cleaned = limitReadableText(cleaned, maxTextLength);
|
||||
}
|
||||
|
||||
String hash = DigestUtils.md5DigestAsHex((voice + "\n" + cleaned).getBytes(StandardCharsets.UTF_8));
|
||||
String hash = DigestUtils.md5DigestAsHex((voice + "\n" + options.cacheKey() + "\n" + cleaned).getBytes(StandardCharsets.UTF_8));
|
||||
TtsTask owned = findOwnedTask(userId, sourceType, sourceId, voice, hash);
|
||||
if (owned != null) {
|
||||
incrementRequestCount(owned);
|
||||
@@ -100,7 +117,7 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
|
||||
TtsTask task = buildTask(userId, sourceType, sourceId, voice, hash, cleaned.length());
|
||||
save(task);
|
||||
String synthesisText = cleaned;
|
||||
CompletableFuture.runAsync(() -> process(task.getId(), synthesisText, voice, task.getAudioPath()), taskExecutor);
|
||||
CompletableFuture.runAsync(() -> process(task.getId(), synthesisText, voice, task.getAudioPath(), options), taskExecutor);
|
||||
return toResponse(task);
|
||||
}
|
||||
|
||||
@@ -115,20 +132,22 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
|
||||
}
|
||||
|
||||
@Override
|
||||
public TtsTaskResponse getBySource(String sourceType, String sourceId, String voice) {
|
||||
public TtsTaskResponse getBySource(String sourceType, String sourceId, String voice,
|
||||
Double speechRate, Double pitch, String emotion) {
|
||||
String userId = currentUserId();
|
||||
TtsTask task = getOne(new LambdaQueryWrapper<TtsTask>()
|
||||
.eq(TtsTask::getUserId, userId)
|
||||
.eq(TtsTask::getSourceType, normalizeSourceType(sourceType))
|
||||
.eq(TtsTask::getSourceId, sourceId)
|
||||
.eq(TtsTask::getVoice, resolveVoice(voice))
|
||||
.eq(TtsTask::getIsDeleted, 0)
|
||||
.orderByDesc(TtsTask::getCreateTime)
|
||||
.last("LIMIT 1"));
|
||||
String normalizedSourceType = normalizeSourceType(sourceType);
|
||||
String normalizedVoice = resolveVoice(voice);
|
||||
TtsEngineClient.SynthesisOptions options = resolveOptions(speechRate, pitch, emotion);
|
||||
String cleaned = cleanText(loadSourceText(userId, normalizedSourceType, sourceId));
|
||||
if (cleaned.length() > maxTextLength) {
|
||||
cleaned = limitReadableText(cleaned, maxTextLength);
|
||||
}
|
||||
String hash = DigestUtils.md5DigestAsHex((normalizedVoice + "\n" + options.cacheKey() + "\n" + cleaned).getBytes(StandardCharsets.UTF_8));
|
||||
TtsTask task = findOwnedTask(userId, normalizedSourceType, sourceId, normalizedVoice, hash);
|
||||
return task == null ? null : toResponse(task);
|
||||
}
|
||||
|
||||
private void process(String taskId, String text, String voice, String outputPath) {
|
||||
private void process(String taskId, String text, String voice, String outputPath, TtsEngineClient.SynthesisOptions options) {
|
||||
try {
|
||||
TtsTask task = getById(taskId);
|
||||
if (task == null) {
|
||||
@@ -138,7 +157,7 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
|
||||
task.setErrorMessage(null);
|
||||
updateById(task);
|
||||
|
||||
TtsEngineClient.TtsEngineResult result = ttsEngineClient.synthesize(text, voice, outputPath);
|
||||
TtsEngineClient.TtsEngineResult result = ttsEngineClient.synthesize(text, voice, outputPath, options);
|
||||
task = getById(taskId);
|
||||
if (task == null) {
|
||||
return;
|
||||
@@ -220,13 +239,15 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
|
||||
|
||||
StringBuilder text = new StringBuilder();
|
||||
append(text, script.getTitle());
|
||||
append(text, script.getPlotIntro());
|
||||
append(text, script.getPlotTurning());
|
||||
append(text, script.getPlotClimax());
|
||||
append(text, script.getPlotEnding());
|
||||
Map<String, Object> plotJson = script.getPlotJson();
|
||||
if (plotJson != null && plotJson.get("fullContent") != null) {
|
||||
append(text, String.valueOf(plotJson.get("fullContent")));
|
||||
Object fullContent = plotJson == null ? null : plotJson.get("fullContent");
|
||||
if (fullContent != null && StringUtils.hasText(String.valueOf(fullContent))) {
|
||||
append(text, String.valueOf(fullContent));
|
||||
} else {
|
||||
append(text, script.getPlotIntro());
|
||||
append(text, script.getPlotTurning());
|
||||
append(text, script.getPlotClimax());
|
||||
append(text, script.getPlotEnding());
|
||||
}
|
||||
return text.toString();
|
||||
}
|
||||
@@ -235,9 +256,39 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
|
||||
if (text == null) {
|
||||
return "";
|
||||
}
|
||||
return text.replaceAll("[#>*_`\\-]", "")
|
||||
.replaceAll("\\s+", " ")
|
||||
.trim();
|
||||
String normalized = text.replace("\r\n", "\n")
|
||||
.replace('\r', '\n')
|
||||
.replaceAll("!\\[[^\\]]*]\\([^)]*\\)", "")
|
||||
.replaceAll("\\[([^\\]]+)]\\([^)]*\\)", "$1")
|
||||
.replaceAll("(?m)^\\s{0,3}#{1,6}\\s*", "")
|
||||
.replaceAll("(?m)^\\s*>\\s?", "")
|
||||
.replaceAll("(?m)^\\s*[-*+]\\s+", "")
|
||||
.replaceAll("(?m)^\\s*\\d+[.)、]\\s+", "")
|
||||
.replaceAll("<[^>]+>", "")
|
||||
.replaceAll("[*_`~]", "")
|
||||
.replaceAll("[“”]", "\"")
|
||||
.replaceAll("[‘’]", "'")
|
||||
.replaceAll("\\.{3,}", "……")
|
||||
.replaceAll("-{2,}", ",")
|
||||
.replaceAll("[\\t\\u00A0]+", " ")
|
||||
.replaceAll(" {2,}", " ")
|
||||
.replaceAll("(?<=[\\p{IsHan}])[ \\t\\u00A0]+(?=[\\p{IsHan}])", "")
|
||||
.replaceAll("[ \\t\\u00A0]*([,。!?;:、,.!?;:])[ \\t\\u00A0]*", "$1")
|
||||
.replaceAll(",", ",")
|
||||
.replaceAll("!", "!")
|
||||
.replaceAll("\\?", "?")
|
||||
.replaceAll(";", ";")
|
||||
.replaceAll(":", ":");
|
||||
|
||||
List<String> paragraphs = new ArrayList<>();
|
||||
for (String paragraph : normalized.split("\\n+")) {
|
||||
String trimmed = paragraph.trim();
|
||||
if (!StringUtils.hasText(trimmed)) {
|
||||
continue;
|
||||
}
|
||||
paragraphs.addAll(toReadableParagraphs(trimmed));
|
||||
}
|
||||
return String.join("\n\n", paragraphs).trim();
|
||||
}
|
||||
|
||||
private TtsTaskResponse toResponse(TtsTask task) {
|
||||
@@ -253,6 +304,23 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
|
||||
.build();
|
||||
}
|
||||
|
||||
private TtsEngineClient.SynthesisOptions resolveOptions(TtsTaskCreateRequest request) {
|
||||
return resolveOptions(request.getSpeechRate(), request.getPitch(), request.getEmotion());
|
||||
}
|
||||
|
||||
private TtsEngineClient.SynthesisOptions resolveOptions(Double requestSpeechRate, Double requestPitch, String requestEmotion) {
|
||||
double speechRate = requestSpeechRate == null ? defaultOrFallback(defaultSpeechRate, FALLBACK_SPEECH_RATE) : requestSpeechRate;
|
||||
double pitch = requestPitch == null ? defaultPitch : requestPitch;
|
||||
String emotion = StringUtils.hasText(requestEmotion)
|
||||
? requestEmotion.trim()
|
||||
: (StringUtils.hasText(defaultEmotion) ? defaultEmotion.trim() : FALLBACK_EMOTION);
|
||||
return new TtsEngineClient.SynthesisOptions(
|
||||
round(clamp(speechRate, 0.60D, 1.40D)),
|
||||
round(clamp(pitch, -20D, 20D)),
|
||||
emotion.toLowerCase(Locale.ROOT)
|
||||
);
|
||||
}
|
||||
|
||||
private String currentUserId() {
|
||||
String userId = UserContextHolder.getCurrentUserId();
|
||||
if (!StringUtils.hasText(userId)) {
|
||||
@@ -275,6 +343,91 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
|
||||
}
|
||||
}
|
||||
|
||||
private static List<String> toReadableParagraphs(String paragraph) {
|
||||
List<String> parts = new ArrayList<>();
|
||||
StringBuilder current = new StringBuilder();
|
||||
for (int index = 0; index < paragraph.length(); index++) {
|
||||
char ch = paragraph.charAt(index);
|
||||
current.append(ch);
|
||||
if (isHardSentenceEnd(ch) || (current.length() >= NATURAL_PARAGRAPH_LIMIT && isSoftPause(ch))) {
|
||||
addReadablePart(parts, current);
|
||||
}
|
||||
}
|
||||
addReadablePart(parts, current);
|
||||
return parts;
|
||||
}
|
||||
|
||||
private static void addReadablePart(List<String> parts, StringBuilder current) {
|
||||
String value = current.toString().trim();
|
||||
current.setLength(0);
|
||||
if (!StringUtils.hasText(value)) {
|
||||
return;
|
||||
}
|
||||
if (value.length() > NATURAL_PARAGRAPH_LIMIT + 40) {
|
||||
splitLongText(parts, value);
|
||||
return;
|
||||
}
|
||||
parts.add(ensureSentenceEnding(value));
|
||||
}
|
||||
|
||||
private static void splitLongText(List<String> parts, String value) {
|
||||
StringBuilder chunk = new StringBuilder();
|
||||
for (int index = 0; index < value.length(); index++) {
|
||||
char ch = value.charAt(index);
|
||||
chunk.append(ch);
|
||||
if (chunk.length() >= NATURAL_PARAGRAPH_LIMIT) {
|
||||
parts.add(ensureSentenceEnding(chunk.toString().trim()));
|
||||
chunk.setLength(0);
|
||||
}
|
||||
}
|
||||
if (chunk.length() > 0) {
|
||||
parts.add(ensureSentenceEnding(chunk.toString().trim()));
|
||||
}
|
||||
}
|
||||
|
||||
private static String ensureSentenceEnding(String value) {
|
||||
if (!StringUtils.hasText(value)) {
|
||||
return "";
|
||||
}
|
||||
char last = value.charAt(value.length() - 1);
|
||||
return isHardSentenceEnd(last) || isSoftPause(last) ? value : value + "。";
|
||||
}
|
||||
|
||||
private static boolean isHardSentenceEnd(char ch) {
|
||||
return ch == '。' || ch == '!' || ch == '?' || ch == ';' || ch == '…';
|
||||
}
|
||||
|
||||
private static boolean isSoftPause(char ch) {
|
||||
return ch == ',' || ch == '、' || ch == ':';
|
||||
}
|
||||
|
||||
private static String limitReadableText(String text, int limit) {
|
||||
if (text.length() <= limit) {
|
||||
return text;
|
||||
}
|
||||
String truncated = text.substring(0, limit);
|
||||
int cut = Math.max(
|
||||
Math.max(truncated.lastIndexOf('。'), truncated.lastIndexOf('!')),
|
||||
Math.max(truncated.lastIndexOf('?'), truncated.lastIndexOf('\n'))
|
||||
);
|
||||
if (cut > limit * 0.75) {
|
||||
return truncated.substring(0, cut + 1).trim();
|
||||
}
|
||||
return ensureSentenceEnding(truncated.trim());
|
||||
}
|
||||
|
||||
private static double clamp(double value, double min, double max) {
|
||||
return Math.max(min, Math.min(max, value));
|
||||
}
|
||||
|
||||
private static double round(double value) {
|
||||
return Math.round(value * 100D) / 100D;
|
||||
}
|
||||
|
||||
private static double defaultOrFallback(double value, double fallback) {
|
||||
return value <= 0D ? fallback : value;
|
||||
}
|
||||
|
||||
private static String joinPath(String prefix, String filename) {
|
||||
if (prefix.endsWith("/")) {
|
||||
return prefix + filename;
|
||||
|
||||
Reference in New Issue
Block a user