feat: TTS 服务功能完善(任务管理、配置优化、客户端实现)
This commit is contained in:
@@ -61,8 +61,11 @@ public class TtsController {
|
|||||||
@GetMapping("/tasks/by-source")
|
@GetMapping("/tasks/by-source")
|
||||||
public Result<TtsTaskResponse> bySource(@Parameter(description = "来源类型") @RequestParam String sourceType,
|
public Result<TtsTaskResponse> bySource(@Parameter(description = "来源类型") @RequestParam String sourceType,
|
||||||
@Parameter(description = "来源 ID") @RequestParam String sourceId,
|
@Parameter(description = "来源 ID") @RequestParam String sourceId,
|
||||||
@Parameter(description = "音色") @RequestParam(required = false) String voice) {
|
@Parameter(description = "音色") @RequestParam(required = false) String voice,
|
||||||
return Result.success(ttsTaskService.getBySource(sourceType, sourceId, voice));
|
@Parameter(description = "语速") @RequestParam(required = false) Double speechRate,
|
||||||
|
@Parameter(description = "音调") @RequestParam(required = false) Double pitch,
|
||||||
|
@Parameter(description = "情绪") @RequestParam(required = false) String emotion) {
|
||||||
|
return Result.success(ttsTaskService.getBySource(sourceType, sourceId, voice, speechRate, pitch, emotion));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Operation(summary = "获取音频文件", description = "返回已合成的音频音频文件(MP3 或 WAV 格式)。")
|
@Operation(summary = "获取音频文件", description = "返回已合成的音频音频文件(MP3 或 WAV 格式)。")
|
||||||
|
|||||||
@@ -3,6 +3,8 @@ package com.emotion.dto.request.tts;
|
|||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
import javax.validation.constraints.NotBlank;
|
import javax.validation.constraints.NotBlank;
|
||||||
|
import javax.validation.constraints.DecimalMax;
|
||||||
|
import javax.validation.constraints.DecimalMin;
|
||||||
import javax.validation.constraints.Size;
|
import javax.validation.constraints.Size;
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
@@ -18,4 +20,15 @@ public class TtsTaskCreateRequest {
|
|||||||
|
|
||||||
@Size(max = 64)
|
@Size(max = 64)
|
||||||
private String voice;
|
private String voice;
|
||||||
|
|
||||||
|
@DecimalMin("0.60")
|
||||||
|
@DecimalMax("1.40")
|
||||||
|
private Double speechRate;
|
||||||
|
|
||||||
|
@DecimalMin("-20.00")
|
||||||
|
@DecimalMax("20.00")
|
||||||
|
private Double pitch;
|
||||||
|
|
||||||
|
@Size(max = 32)
|
||||||
|
private String emotion;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,6 +12,9 @@ public class TtsTaskResponse {
|
|||||||
private String sourceId;
|
private String sourceId;
|
||||||
private String status;
|
private String status;
|
||||||
private String voice;
|
private String voice;
|
||||||
|
private Double speechRate;
|
||||||
|
private Double pitch;
|
||||||
|
private String emotion;
|
||||||
private String audioUrl;
|
private String audioUrl;
|
||||||
private Long durationMs;
|
private Long durationMs;
|
||||||
private String errorMessage;
|
private String errorMessage;
|
||||||
|
|||||||
@@ -2,7 +2,37 @@ package com.emotion.service;
|
|||||||
|
|
||||||
public interface TtsEngineClient {
|
public interface TtsEngineClient {
|
||||||
|
|
||||||
TtsEngineResult synthesize(String text, String voice, String outputPath);
|
TtsEngineResult synthesize(String text, String voice, String outputPath, SynthesisOptions options);
|
||||||
|
|
||||||
|
/**
 * Immutable set of optional prosody settings that accompany a synthesis call.
 *
 * <p>Every component may be {@code null}, meaning "not specified". The class
 * performs no validation or normalization of its own; it only stores the
 * values it is given and can render them as a deterministic key.
 */
class SynthesisOptions {
    private final Double speechRate;
    private final Double pitch;
    private final String emotion;

    /**
     * @param speechRate requested speech rate, or {@code null} when unspecified
     * @param pitch      requested pitch, or {@code null} when unspecified
     * @param emotion    requested emotion label, or {@code null} when unspecified
     */
    public SynthesisOptions(Double speechRate, Double pitch, String emotion) {
        this.speechRate = speechRate;
        this.pitch = pitch;
        this.emotion = emotion;
    }

    /** @return the speech rate, possibly {@code null} */
    public Double getSpeechRate() {
        return speechRate;
    }

    /** @return the pitch, possibly {@code null} */
    public Double getPitch() {
        return pitch;
    }

    /** @return the emotion label, possibly {@code null} */
    public String getEmotion() {
        return emotion;
    }

    /**
     * Renders the three settings as {@code rate=<r>;pitch=<p>;emotion=<e>}.
     * A {@code null} component is rendered as an empty value, so two option
     * sets yield the same key exactly when all their components print alike.
     *
     * @return the textual key for this option set
     */
    public String cacheKey() {
        StringBuilder key = new StringBuilder("rate=");
        if (speechRate != null) {
            key.append(speechRate);
        }
        key.append(";pitch=");
        if (pitch != null) {
            key.append(pitch);
        }
        key.append(";emotion=");
        if (emotion != null) {
            key.append(emotion);
        }
        return key.toString();
    }
}
|
||||||
|
|
||||||
class TtsEngineResult {
|
class TtsEngineResult {
|
||||||
private final boolean success;
|
private final boolean success;
|
||||||
|
|||||||
@@ -11,5 +11,6 @@ public interface TtsTaskService extends IService<TtsTask> {
|
|||||||
|
|
||||||
TtsTaskResponse getTask(String id);
|
TtsTaskResponse getTask(String id);
|
||||||
|
|
||||||
TtsTaskResponse getBySource(String sourceType, String sourceId, String voice);
|
TtsTaskResponse getBySource(String sourceType, String sourceId, String voice,
|
||||||
|
Double speechRate, Double pitch, String emotion);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,7 +5,9 @@ import org.springframework.beans.factory.annotation.Value;
|
|||||||
import org.springframework.http.ResponseEntity;
|
import org.springframework.http.ResponseEntity;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
import org.springframework.web.client.RestTemplate;
|
import org.springframework.web.client.RestTemplate;
|
||||||
|
import org.springframework.util.StringUtils;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
@@ -21,13 +23,23 @@ public class HttpTtsEngineClient implements TtsEngineClient {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TtsEngineResult synthesize(String text, String voice, String outputPath) {
|
public TtsEngineResult synthesize(String text, String voice, String outputPath, SynthesisOptions options) {
|
||||||
try {
|
try {
|
||||||
Map<String, Object> body = Map.of(
|
Map<String, Object> body = new HashMap<>();
|
||||||
"text", text,
|
body.put("text", text);
|
||||||
"voice", voice,
|
body.put("voice", voice);
|
||||||
"outputPath", outputPath
|
body.put("outputPath", outputPath);
|
||||||
);
|
if (options != null) {
|
||||||
|
if (options.getSpeechRate() != null) {
|
||||||
|
body.put("speechRate", options.getSpeechRate());
|
||||||
|
}
|
||||||
|
if (options.getPitch() != null) {
|
||||||
|
body.put("pitch", options.getPitch());
|
||||||
|
}
|
||||||
|
if (StringUtils.hasText(options.getEmotion())) {
|
||||||
|
body.put("emotion", options.getEmotion());
|
||||||
|
}
|
||||||
|
}
|
||||||
ResponseEntity<Map> response = restTemplate.postForEntity(engineUrl + "/synthesize", body, Map.class);
|
ResponseEntity<Map> response = restTemplate.postForEntity(engineUrl + "/synthesize", body, Map.class);
|
||||||
Map<?, ?> data = response.getBody();
|
Map<?, ?> data = response.getBody();
|
||||||
boolean success = data != null && Boolean.TRUE.equals(data.get("success"));
|
boolean success = data != null && Boolean.TRUE.equals(data.get("success"));
|
||||||
|
|||||||
@@ -19,6 +19,9 @@ import org.springframework.util.DigestUtils;
|
|||||||
import org.springframework.util.StringUtils;
|
import org.springframework.util.StringUtils;
|
||||||
|
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.concurrent.CompletableFuture;
|
import java.util.concurrent.CompletableFuture;
|
||||||
import java.util.concurrent.Executor;
|
import java.util.concurrent.Executor;
|
||||||
@@ -32,6 +35,10 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
|
|||||||
private static final String STATUS_PROCESSING = "processing";
|
private static final String STATUS_PROCESSING = "processing";
|
||||||
private static final String STATUS_SUCCESS = "success";
|
private static final String STATUS_SUCCESS = "success";
|
||||||
private static final String STATUS_FAILED = "failed";
|
private static final String STATUS_FAILED = "failed";
|
||||||
|
private static final double FALLBACK_SPEECH_RATE = 0.92D;
|
||||||
|
private static final double FALLBACK_PITCH = 0D;
|
||||||
|
private static final String FALLBACK_EMOTION = "story";
|
||||||
|
private static final int NATURAL_PARAGRAPH_LIMIT = 140;
|
||||||
|
|
||||||
private final EpicScriptMapper epicScriptMapper;
|
private final EpicScriptMapper epicScriptMapper;
|
||||||
private final TtsEngineClient ttsEngineClient;
|
private final TtsEngineClient ttsEngineClient;
|
||||||
@@ -52,6 +59,15 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
|
|||||||
@Value("${emotion.tts.default-voice:default_zh_female}")
|
@Value("${emotion.tts.default-voice:default_zh_female}")
|
||||||
private String defaultVoice;
|
private String defaultVoice;
|
||||||
|
|
||||||
|
@Value("${emotion.tts.default-speech-rate:0.92}")
|
||||||
|
private double defaultSpeechRate;
|
||||||
|
|
||||||
|
@Value("${emotion.tts.default-pitch:0}")
|
||||||
|
private double defaultPitch;
|
||||||
|
|
||||||
|
@Value("${emotion.tts.default-emotion:story}")
|
||||||
|
private String defaultEmotion;
|
||||||
|
|
||||||
public TtsTaskServiceImpl(EpicScriptMapper epicScriptMapper,
|
public TtsTaskServiceImpl(EpicScriptMapper epicScriptMapper,
|
||||||
TtsEngineClient ttsEngineClient,
|
TtsEngineClient ttsEngineClient,
|
||||||
@Qualifier("taskExecutor") Executor taskExecutor) {
|
@Qualifier("taskExecutor") Executor taskExecutor) {
|
||||||
@@ -70,15 +86,16 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
|
|||||||
String sourceType = normalizeSourceType(request.getSourceType());
|
String sourceType = normalizeSourceType(request.getSourceType());
|
||||||
String sourceId = request.getSourceId().trim();
|
String sourceId = request.getSourceId().trim();
|
||||||
String voice = resolveVoice(request.getVoice());
|
String voice = resolveVoice(request.getVoice());
|
||||||
|
TtsEngineClient.SynthesisOptions options = resolveOptions(request);
|
||||||
String cleaned = cleanText(loadSourceText(userId, sourceType, sourceId));
|
String cleaned = cleanText(loadSourceText(userId, sourceType, sourceId));
|
||||||
if (!StringUtils.hasText(cleaned)) {
|
if (!StringUtils.hasText(cleaned)) {
|
||||||
throw new IllegalArgumentException("Source text is empty");
|
throw new IllegalArgumentException("Source text is empty");
|
||||||
}
|
}
|
||||||
if (cleaned.length() > maxTextLength) {
|
if (cleaned.length() > maxTextLength) {
|
||||||
cleaned = cleaned.substring(0, maxTextLength);
|
cleaned = limitReadableText(cleaned, maxTextLength);
|
||||||
}
|
}
|
||||||
|
|
||||||
String hash = DigestUtils.md5DigestAsHex((voice + "\n" + cleaned).getBytes(StandardCharsets.UTF_8));
|
String hash = DigestUtils.md5DigestAsHex((voice + "\n" + options.cacheKey() + "\n" + cleaned).getBytes(StandardCharsets.UTF_8));
|
||||||
TtsTask owned = findOwnedTask(userId, sourceType, sourceId, voice, hash);
|
TtsTask owned = findOwnedTask(userId, sourceType, sourceId, voice, hash);
|
||||||
if (owned != null) {
|
if (owned != null) {
|
||||||
incrementRequestCount(owned);
|
incrementRequestCount(owned);
|
||||||
@@ -100,7 +117,7 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
|
|||||||
TtsTask task = buildTask(userId, sourceType, sourceId, voice, hash, cleaned.length());
|
TtsTask task = buildTask(userId, sourceType, sourceId, voice, hash, cleaned.length());
|
||||||
save(task);
|
save(task);
|
||||||
String synthesisText = cleaned;
|
String synthesisText = cleaned;
|
||||||
CompletableFuture.runAsync(() -> process(task.getId(), synthesisText, voice, task.getAudioPath()), taskExecutor);
|
CompletableFuture.runAsync(() -> process(task.getId(), synthesisText, voice, task.getAudioPath(), options), taskExecutor);
|
||||||
return toResponse(task);
|
return toResponse(task);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -115,20 +132,22 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TtsTaskResponse getBySource(String sourceType, String sourceId, String voice) {
|
public TtsTaskResponse getBySource(String sourceType, String sourceId, String voice,
|
||||||
|
Double speechRate, Double pitch, String emotion) {
|
||||||
String userId = currentUserId();
|
String userId = currentUserId();
|
||||||
TtsTask task = getOne(new LambdaQueryWrapper<TtsTask>()
|
String normalizedSourceType = normalizeSourceType(sourceType);
|
||||||
.eq(TtsTask::getUserId, userId)
|
String normalizedVoice = resolveVoice(voice);
|
||||||
.eq(TtsTask::getSourceType, normalizeSourceType(sourceType))
|
TtsEngineClient.SynthesisOptions options = resolveOptions(speechRate, pitch, emotion);
|
||||||
.eq(TtsTask::getSourceId, sourceId)
|
String cleaned = cleanText(loadSourceText(userId, normalizedSourceType, sourceId));
|
||||||
.eq(TtsTask::getVoice, resolveVoice(voice))
|
if (cleaned.length() > maxTextLength) {
|
||||||
.eq(TtsTask::getIsDeleted, 0)
|
cleaned = limitReadableText(cleaned, maxTextLength);
|
||||||
.orderByDesc(TtsTask::getCreateTime)
|
}
|
||||||
.last("LIMIT 1"));
|
String hash = DigestUtils.md5DigestAsHex((normalizedVoice + "\n" + options.cacheKey() + "\n" + cleaned).getBytes(StandardCharsets.UTF_8));
|
||||||
|
TtsTask task = findOwnedTask(userId, normalizedSourceType, sourceId, normalizedVoice, hash);
|
||||||
return task == null ? null : toResponse(task);
|
return task == null ? null : toResponse(task);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void process(String taskId, String text, String voice, String outputPath) {
|
private void process(String taskId, String text, String voice, String outputPath, TtsEngineClient.SynthesisOptions options) {
|
||||||
try {
|
try {
|
||||||
TtsTask task = getById(taskId);
|
TtsTask task = getById(taskId);
|
||||||
if (task == null) {
|
if (task == null) {
|
||||||
@@ -138,7 +157,7 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
|
|||||||
task.setErrorMessage(null);
|
task.setErrorMessage(null);
|
||||||
updateById(task);
|
updateById(task);
|
||||||
|
|
||||||
TtsEngineClient.TtsEngineResult result = ttsEngineClient.synthesize(text, voice, outputPath);
|
TtsEngineClient.TtsEngineResult result = ttsEngineClient.synthesize(text, voice, outputPath, options);
|
||||||
task = getById(taskId);
|
task = getById(taskId);
|
||||||
if (task == null) {
|
if (task == null) {
|
||||||
return;
|
return;
|
||||||
@@ -220,13 +239,15 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
|
|||||||
|
|
||||||
StringBuilder text = new StringBuilder();
|
StringBuilder text = new StringBuilder();
|
||||||
append(text, script.getTitle());
|
append(text, script.getTitle());
|
||||||
|
Map<String, Object> plotJson = script.getPlotJson();
|
||||||
|
Object fullContent = plotJson == null ? null : plotJson.get("fullContent");
|
||||||
|
if (fullContent != null && StringUtils.hasText(String.valueOf(fullContent))) {
|
||||||
|
append(text, String.valueOf(fullContent));
|
||||||
|
} else {
|
||||||
append(text, script.getPlotIntro());
|
append(text, script.getPlotIntro());
|
||||||
append(text, script.getPlotTurning());
|
append(text, script.getPlotTurning());
|
||||||
append(text, script.getPlotClimax());
|
append(text, script.getPlotClimax());
|
||||||
append(text, script.getPlotEnding());
|
append(text, script.getPlotEnding());
|
||||||
Map<String, Object> plotJson = script.getPlotJson();
|
|
||||||
if (plotJson != null && plotJson.get("fullContent") != null) {
|
|
||||||
append(text, String.valueOf(plotJson.get("fullContent")));
|
|
||||||
}
|
}
|
||||||
return text.toString();
|
return text.toString();
|
||||||
}
|
}
|
||||||
@@ -235,9 +256,39 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
|
|||||||
if (text == null) {
|
if (text == null) {
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
return text.replaceAll("[#>*_`\\-]", "")
|
String normalized = text.replace("\r\n", "\n")
|
||||||
.replaceAll("\\s+", " ")
|
.replace('\r', '\n')
|
||||||
.trim();
|
.replaceAll("!\\[[^\\]]*]\\([^)]*\\)", "")
|
||||||
|
.replaceAll("\\[([^\\]]+)]\\([^)]*\\)", "$1")
|
||||||
|
.replaceAll("(?m)^\\s{0,3}#{1,6}\\s*", "")
|
||||||
|
.replaceAll("(?m)^\\s*>\\s?", "")
|
||||||
|
.replaceAll("(?m)^\\s*[-*+]\\s+", "")
|
||||||
|
.replaceAll("(?m)^\\s*\\d+[.)、]\\s+", "")
|
||||||
|
.replaceAll("<[^>]+>", "")
|
||||||
|
.replaceAll("[*_`~]", "")
|
||||||
|
.replaceAll("[“”]", "\"")
|
||||||
|
.replaceAll("[‘’]", "'")
|
||||||
|
.replaceAll("\\.{3,}", "……")
|
||||||
|
.replaceAll("-{2,}", ",")
|
||||||
|
.replaceAll("[\\t\\u00A0]+", " ")
|
||||||
|
.replaceAll(" {2,}", " ")
|
||||||
|
.replaceAll("(?<=[\\p{IsHan}])[ \\t\\u00A0]+(?=[\\p{IsHan}])", "")
|
||||||
|
.replaceAll("[ \\t\\u00A0]*([,。!?;:、,.!?;:])[ \\t\\u00A0]*", "$1")
|
||||||
|
.replaceAll(",", ",")
|
||||||
|
.replaceAll("!", "!")
|
||||||
|
.replaceAll("\\?", "?")
|
||||||
|
.replaceAll(";", ";")
|
||||||
|
.replaceAll(":", ":");
|
||||||
|
|
||||||
|
List<String> paragraphs = new ArrayList<>();
|
||||||
|
for (String paragraph : normalized.split("\\n+")) {
|
||||||
|
String trimmed = paragraph.trim();
|
||||||
|
if (!StringUtils.hasText(trimmed)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
paragraphs.addAll(toReadableParagraphs(trimmed));
|
||||||
|
}
|
||||||
|
return String.join("\n\n", paragraphs).trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
private TtsTaskResponse toResponse(TtsTask task) {
|
private TtsTaskResponse toResponse(TtsTask task) {
|
||||||
@@ -253,6 +304,23 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
|
|||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private TtsEngineClient.SynthesisOptions resolveOptions(TtsTaskCreateRequest request) {
|
||||||
|
return resolveOptions(request.getSpeechRate(), request.getPitch(), request.getEmotion());
|
||||||
|
}
|
||||||
|
|
||||||
|
private TtsEngineClient.SynthesisOptions resolveOptions(Double requestSpeechRate, Double requestPitch, String requestEmotion) {
|
||||||
|
double speechRate = requestSpeechRate == null ? defaultOrFallback(defaultSpeechRate, FALLBACK_SPEECH_RATE) : requestSpeechRate;
|
||||||
|
double pitch = requestPitch == null ? defaultPitch : requestPitch;
|
||||||
|
String emotion = StringUtils.hasText(requestEmotion)
|
||||||
|
? requestEmotion.trim()
|
||||||
|
: (StringUtils.hasText(defaultEmotion) ? defaultEmotion.trim() : FALLBACK_EMOTION);
|
||||||
|
return new TtsEngineClient.SynthesisOptions(
|
||||||
|
round(clamp(speechRate, 0.60D, 1.40D)),
|
||||||
|
round(clamp(pitch, -20D, 20D)),
|
||||||
|
emotion.toLowerCase(Locale.ROOT)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
private String currentUserId() {
|
private String currentUserId() {
|
||||||
String userId = UserContextHolder.getCurrentUserId();
|
String userId = UserContextHolder.getCurrentUserId();
|
||||||
if (!StringUtils.hasText(userId)) {
|
if (!StringUtils.hasText(userId)) {
|
||||||
@@ -275,6 +343,91 @@ public class TtsTaskServiceImpl extends ServiceImpl<TtsTaskMapper, TtsTask> impl
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static List<String> toReadableParagraphs(String paragraph) {
|
||||||
|
List<String> parts = new ArrayList<>();
|
||||||
|
StringBuilder current = new StringBuilder();
|
||||||
|
for (int index = 0; index < paragraph.length(); index++) {
|
||||||
|
char ch = paragraph.charAt(index);
|
||||||
|
current.append(ch);
|
||||||
|
if (isHardSentenceEnd(ch) || (current.length() >= NATURAL_PARAGRAPH_LIMIT && isSoftPause(ch))) {
|
||||||
|
addReadablePart(parts, current);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
addReadablePart(parts, current);
|
||||||
|
return parts;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void addReadablePart(List<String> parts, StringBuilder current) {
|
||||||
|
String value = current.toString().trim();
|
||||||
|
current.setLength(0);
|
||||||
|
if (!StringUtils.hasText(value)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (value.length() > NATURAL_PARAGRAPH_LIMIT + 40) {
|
||||||
|
splitLongText(parts, value);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
parts.add(ensureSentenceEnding(value));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void splitLongText(List<String> parts, String value) {
|
||||||
|
StringBuilder chunk = new StringBuilder();
|
||||||
|
for (int index = 0; index < value.length(); index++) {
|
||||||
|
char ch = value.charAt(index);
|
||||||
|
chunk.append(ch);
|
||||||
|
if (chunk.length() >= NATURAL_PARAGRAPH_LIMIT) {
|
||||||
|
parts.add(ensureSentenceEnding(chunk.toString().trim()));
|
||||||
|
chunk.setLength(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (chunk.length() > 0) {
|
||||||
|
parts.add(ensureSentenceEnding(chunk.toString().trim()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String ensureSentenceEnding(String value) {
|
||||||
|
if (!StringUtils.hasText(value)) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
char last = value.charAt(value.length() - 1);
|
||||||
|
return isHardSentenceEnd(last) || isSoftPause(last) ? value : value + "。";
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean isHardSentenceEnd(char ch) {
|
||||||
|
return ch == '。' || ch == '!' || ch == '?' || ch == ';' || ch == '…';
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean isSoftPause(char ch) {
|
||||||
|
return ch == ',' || ch == '、' || ch == ':';
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String limitReadableText(String text, int limit) {
|
||||||
|
if (text.length() <= limit) {
|
||||||
|
return text;
|
||||||
|
}
|
||||||
|
String truncated = text.substring(0, limit);
|
||||||
|
int cut = Math.max(
|
||||||
|
Math.max(truncated.lastIndexOf('。'), truncated.lastIndexOf('!')),
|
||||||
|
Math.max(truncated.lastIndexOf('?'), truncated.lastIndexOf('\n'))
|
||||||
|
);
|
||||||
|
if (cut > limit * 0.75) {
|
||||||
|
return truncated.substring(0, cut + 1).trim();
|
||||||
|
}
|
||||||
|
return ensureSentenceEnding(truncated.trim());
|
||||||
|
}
|
||||||
|
|
||||||
|
private static double clamp(double value, double min, double max) {
|
||||||
|
return Math.max(min, Math.min(max, value));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static double round(double value) {
|
||||||
|
return Math.round(value * 100D) / 100D;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static double defaultOrFallback(double value, double fallback) {
|
||||||
|
return value <= 0D ? fallback : value;
|
||||||
|
}
|
||||||
|
|
||||||
private static String joinPath(String prefix, String filename) {
|
private static String joinPath(String prefix, String filename) {
|
||||||
if (prefix.endsWith("/")) {
|
if (prefix.endsWith("/")) {
|
||||||
return prefix + filename;
|
return prefix + filename;
|
||||||
|
|||||||
@@ -69,6 +69,9 @@ emotion:
|
|||||||
public-url-prefix: /tts/audio
|
public-url-prefix: /tts/audio
|
||||||
max-text-length: 5000
|
max-text-length: 5000
|
||||||
default-voice: default_zh_female
|
default-voice: default_zh_female
|
||||||
|
default-speech-rate: 0.92
|
||||||
|
default-pitch: 0
|
||||||
|
default-emotion: story
|
||||||
|
|
||||||
# Speech-to-text config
|
# Speech-to-text config
|
||||||
asr:
|
asr:
|
||||||
|
|||||||
@@ -106,6 +106,9 @@ emotion:
|
|||||||
public-url-prefix: /tts/audio
|
public-url-prefix: /tts/audio
|
||||||
max-text-length: 5000
|
max-text-length: 5000
|
||||||
default-voice: default_zh_female
|
default-voice: default_zh_female
|
||||||
|
default-speech-rate: 0.92
|
||||||
|
default-pitch: 0
|
||||||
|
default-emotion: story
|
||||||
|
|
||||||
# Speech-to-text config
|
# Speech-to-text config
|
||||||
asr:
|
asr:
|
||||||
|
|||||||
@@ -12,11 +12,19 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
|
|||||||
class TtsTaskServiceTest {
|
class TtsTaskServiceTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@DisplayName("cleanText strips markdown and normalizes whitespace")
|
@DisplayName("cleanText strips markdown but keeps Chinese narration rhythm")
|
||||||
void cleanTextStripsMarkdownAndNormalizesWhitespace() {
|
void cleanTextStripsMarkdownButKeepsChineseNarrationRhythm() {
|
||||||
String cleaned = TtsTaskServiceImpl.cleanText("# Title\n\n> **hello** `world` - ok");
|
String cleaned = TtsTaskServiceImpl.cleanText("# 第一章\n\n> **她 终于** 看见了自己\n\n- 转身离开");
|
||||||
|
|
||||||
assertEquals("Title hello world ok", cleaned);
|
assertEquals("第一章。\n\n她终于看见了自己。\n\n转身离开。", cleaned);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@DisplayName("cleanText preserves sentence punctuation for natural pauses")
|
||||||
|
void cleanTextPreservesSentencePunctuationForNaturalPauses() {
|
||||||
|
String cleaned = TtsTaskServiceImpl.cleanText("他说: 这一次,我想自己选择!\n\n你听见了吗?");
|
||||||
|
|
||||||
|
assertEquals("他说:这一次,我想自己选择!\n\n你听见了吗?", cleaned);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -28,6 +36,9 @@ class TtsTaskServiceTest {
|
|||||||
@Test
|
@Test
|
||||||
@DisplayName("TtsEngineResult exposes synthesis result fields")
|
@DisplayName("TtsEngineResult exposes synthesis result fields")
|
||||||
void ttsEngineResultExposesFields() {
|
void ttsEngineResultExposesFields() {
|
||||||
|
TtsEngineClient.SynthesisOptions options = new TtsEngineClient.SynthesisOptions(0.92D, 0D, "story");
|
||||||
|
assertEquals("rate=0.92;pitch=0.0;emotion=story", options.cacheKey());
|
||||||
|
|
||||||
TtsEngineClient.TtsEngineResult result =
|
TtsEngineClient.TtsEngineResult result =
|
||||||
new TtsEngineClient.TtsEngineResult(true, "/tmp/a.mp3", 1200L, null);
|
new TtsEngineClient.TtsEngineResult(true, "/tmp/a.mp3", 1200L, null);
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import subprocess
|
import subprocess
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
@@ -16,6 +17,42 @@ class SynthesizeRequest(BaseModel):
|
|||||||
text: str = Field(min_length=1, max_length=5000)
|
text: str = Field(min_length=1, max_length=5000)
|
||||||
voice: str = "default_zh_female"
|
voice: str = "default_zh_female"
|
||||||
outputPath: str
|
outputPath: str
|
||||||
|
speechRate: Optional[float] = Field(default=0.92, ge=0.6, le=1.4)
|
||||||
|
pitch: Optional[float] = Field(default=0.0, ge=-20.0, le=20.0)
|
||||||
|
emotion: Optional[str] = "story"
|
||||||
|
|
||||||
|
|
||||||
|
def clamp(value: float, minimum: float, maximum: float) -> float:
    """Restrict ``value`` to the closed interval ``[minimum, maximum]``.

    The value is first capped at ``maximum`` and then raised to at least
    ``minimum``.
    """
    capped = min(maximum, value)
    return max(minimum, capped)
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_piper_args(request: SynthesizeRequest) -> list[str]:
    """Translate a synthesis request into extra Piper command-line arguments.

    The speech rate (0.92 when missing or falsy, clamped to 0.6-1.4) is
    inverted into ``--length_scale`` and rounded to two decimals. The emotion
    label ("story" when missing or empty, compared in lower case) selects one
    of three presets for sentence silence and the two noise settings; labels
    outside the known groups use the baseline preset. ``request.pitch`` is not
    consulted here.

    Returns a flat list alternating flag names and their string values.
    """
    rate = clamp(float(request.speechRate or 0.92), 0.6, 1.4)
    mood = (request.emotion or "story").lower()

    # Each preset is (sentence_silence, noise_scale, noise_w).
    if mood in {"calm", "soft", "warm"}:
        sentence_silence, noise_scale, noise_w = 0.5, 0.58, 0.68
    elif mood in {"story", "narration", "expressive"}:
        sentence_silence, noise_scale, noise_w = 0.48, 0.68, 0.76
    else:
        sentence_silence, noise_scale, noise_w = 0.46, 0.64, 0.72

    # Insertion order of the mapping fixes the order of the emitted flags.
    settings = {
        "--sentence-silence": sentence_silence,
        "--length_scale": round(1.0 / rate, 2),
        "--noise_scale": noise_scale,
        "--noise_w": noise_w,
    }
    args = []
    for flag, setting in settings.items():
        args.extend((flag, str(setting)))
    return args
|
||||||
|
|
||||||
|
|
||||||
@app.get("/health")
|
@app.get("/health")
|
||||||
@@ -47,8 +84,7 @@ def synthesize(request: SynthesizeRequest):
|
|||||||
str(PIPER_CONFIG),
|
str(PIPER_CONFIG),
|
||||||
"--output_file",
|
"--output_file",
|
||||||
str(output),
|
str(output),
|
||||||
"--sentence-silence",
|
*resolve_piper_args(request),
|
||||||
"0.35",
|
|
||||||
],
|
],
|
||||||
input=request.text,
|
input=request.text,
|
||||||
text=True,
|
text=True,
|
||||||
|
|||||||
Reference in New Issue
Block a user