diff --git a/backend-single/src/main/java/com/emotion/controller/TtsController.java b/backend-single/src/main/java/com/emotion/controller/TtsController.java index 02fefa0..d1284f2 100644 --- a/backend-single/src/main/java/com/emotion/controller/TtsController.java +++ b/backend-single/src/main/java/com/emotion/controller/TtsController.java @@ -61,8 +61,11 @@ public class TtsController { @GetMapping("/tasks/by-source") public Result bySource(@Parameter(description = "来源类型") @RequestParam String sourceType, @Parameter(description = "来源 ID") @RequestParam String sourceId, - @Parameter(description = "音色") @RequestParam(required = false) String voice) { - return Result.success(ttsTaskService.getBySource(sourceType, sourceId, voice)); + @Parameter(description = "音色") @RequestParam(required = false) String voice, + @Parameter(description = "语速") @RequestParam(required = false) Double speechRate, + @Parameter(description = "音调") @RequestParam(required = false) Double pitch, + @Parameter(description = "情绪") @RequestParam(required = false) String emotion) { + return Result.success(ttsTaskService.getBySource(sourceType, sourceId, voice, speechRate, pitch, emotion)); } @Operation(summary = "获取音频文件", description = "返回已合成的音频音频文件(MP3 或 WAV 格式)。") diff --git a/backend-single/src/main/java/com/emotion/dto/request/tts/TtsTaskCreateRequest.java b/backend-single/src/main/java/com/emotion/dto/request/tts/TtsTaskCreateRequest.java index 0577ce4..83b6aef 100644 --- a/backend-single/src/main/java/com/emotion/dto/request/tts/TtsTaskCreateRequest.java +++ b/backend-single/src/main/java/com/emotion/dto/request/tts/TtsTaskCreateRequest.java @@ -3,6 +3,8 @@ package com.emotion.dto.request.tts; import lombok.Data; import javax.validation.constraints.NotBlank; +import javax.validation.constraints.DecimalMax; +import javax.validation.constraints.DecimalMin; import javax.validation.constraints.Size; @Data @@ -18,4 +20,15 @@ public class TtsTaskCreateRequest { @Size(max = 64) private String voice; + + @DecimalMin("0.60") + @DecimalMax("1.40") + private Double speechRate; + + @DecimalMin("-20.00") + @DecimalMax("20.00") + private Double pitch; + + @Size(max = 32) + private String emotion; } diff --git a/backend-single/src/main/java/com/emotion/dto/response/tts/TtsTaskResponse.java b/backend-single/src/main/java/com/emotion/dto/response/tts/TtsTaskResponse.java index 309e9a5..b17d1f9 100644 --- a/backend-single/src/main/java/com/emotion/dto/response/tts/TtsTaskResponse.java +++ b/backend-single/src/main/java/com/emotion/dto/response/tts/TtsTaskResponse.java @@ -12,6 +12,9 @@ public class TtsTaskResponse { private String sourceId; private String status; private String voice; + private Double speechRate; + private Double pitch; + private String emotion; private String audioUrl; private Long durationMs; private String errorMessage; diff --git a/backend-single/src/main/java/com/emotion/service/TtsEngineClient.java b/backend-single/src/main/java/com/emotion/service/TtsEngineClient.java index f1dc82a..7441cb2 100644 --- a/backend-single/src/main/java/com/emotion/service/TtsEngineClient.java +++ b/backend-single/src/main/java/com/emotion/service/TtsEngineClient.java @@ -2,7 +2,37 @@ package com.emotion.service; public interface TtsEngineClient { - TtsEngineResult synthesize(String text, String voice, String outputPath); + TtsEngineResult synthesize(String text, String voice, String outputPath, SynthesisOptions options); + + class SynthesisOptions { + private final Double speechRate; + private final Double pitch; + private final String emotion; + + public SynthesisOptions(Double speechRate, Double pitch, String emotion) { + this.speechRate = speechRate; + this.pitch = pitch; + this.emotion = emotion; + } + + public Double getSpeechRate() { + return speechRate; + } + + public Double getPitch() { + return pitch; + } + + public String getEmotion() { + return emotion; + } + + public String cacheKey() { + return "rate=" + (speechRate == null ? "" : speechRate) + + ";pitch=" + (pitch == null ? "" : pitch) + + ";emotion=" + (emotion == null ? "" : emotion); + } + } class TtsEngineResult { private final boolean success; diff --git a/backend-single/src/main/java/com/emotion/service/TtsTaskService.java b/backend-single/src/main/java/com/emotion/service/TtsTaskService.java index 3f629cc..a218d87 100644 --- a/backend-single/src/main/java/com/emotion/service/TtsTaskService.java +++ b/backend-single/src/main/java/com/emotion/service/TtsTaskService.java @@ -11,5 +11,6 @@ public interface TtsTaskService extends IService { TtsTaskResponse getTask(String id); - TtsTaskResponse getBySource(String sourceType, String sourceId, String voice); + TtsTaskResponse getBySource(String sourceType, String sourceId, String voice, + Double speechRate, Double pitch, String emotion); } diff --git a/backend-single/src/main/java/com/emotion/service/impl/HttpTtsEngineClient.java b/backend-single/src/main/java/com/emotion/service/impl/HttpTtsEngineClient.java index 18aed35..227a3e5 100644 --- a/backend-single/src/main/java/com/emotion/service/impl/HttpTtsEngineClient.java +++ b/backend-single/src/main/java/com/emotion/service/impl/HttpTtsEngineClient.java @@ -5,7 +5,9 @@ import org.springframework.beans.factory.annotation.Value; import org.springframework.http.ResponseEntity; import org.springframework.stereotype.Service; import org.springframework.web.client.RestTemplate; +import org.springframework.util.StringUtils; +import java.util.HashMap; import java.util.Map; @Service @@ -21,13 +23,23 @@ public class HttpTtsEngineClient implements TtsEngineClient { } @Override - public TtsEngineResult synthesize(String text, String voice, String outputPath) { + public TtsEngineResult synthesize(String text, String voice, String outputPath, SynthesisOptions options) { try { - Map body = Map.of( - "text", text, - "voice", voice, - "outputPath", outputPath - ); + Map body = new HashMap<>(); + body.put("text", text); + body.put("voice", voice); + body.put("outputPath", outputPath); + if (options != null) { + if (options.getSpeechRate() != null) { + body.put("speechRate", options.getSpeechRate()); + } + if (options.getPitch() != null) { + body.put("pitch", options.getPitch()); + } + if (StringUtils.hasText(options.getEmotion())) { + body.put("emotion", options.getEmotion()); + } + } ResponseEntity response = restTemplate.postForEntity(engineUrl + "/synthesize", body, Map.class); Map data = response.getBody(); boolean success = data != null && Boolean.TRUE.equals(data.get("success")); diff --git a/backend-single/src/main/java/com/emotion/service/impl/TtsTaskServiceImpl.java b/backend-single/src/main/java/com/emotion/service/impl/TtsTaskServiceImpl.java index 5cd0c5a..5585d8c 100644 --- a/backend-single/src/main/java/com/emotion/service/impl/TtsTaskServiceImpl.java +++ b/backend-single/src/main/java/com/emotion/service/impl/TtsTaskServiceImpl.java @@ -19,6 +19,9 @@ import org.springframework.util.DigestUtils; import org.springframework.util.StringUtils; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.concurrent.CompletableFuture; import java.util.concurrent.Executor; @@ -32,6 +35,10 @@ public class TtsTaskServiceImpl extends ServiceImpl impl private static final String STATUS_PROCESSING = "processing"; private static final String STATUS_SUCCESS = "success"; private static final String STATUS_FAILED = "failed"; + private static final double FALLBACK_SPEECH_RATE = 0.92D; + private static final double FALLBACK_PITCH = 0D; + private static final String FALLBACK_EMOTION = "story"; + private static final int NATURAL_PARAGRAPH_LIMIT = 140; private final EpicScriptMapper epicScriptMapper; private final TtsEngineClient ttsEngineClient; @@ -52,6 +59,15 @@ public class TtsTaskServiceImpl extends ServiceImpl impl @Value("${emotion.tts.default-voice:default_zh_female}") private String defaultVoice; + @Value("${emotion.tts.default-speech-rate:0.92}") + private double defaultSpeechRate; + + @Value("${emotion.tts.default-pitch:0}") + private double defaultPitch; + + @Value("${emotion.tts.default-emotion:story}") + private String defaultEmotion; + public TtsTaskServiceImpl(EpicScriptMapper epicScriptMapper, TtsEngineClient ttsEngineClient, @Qualifier("taskExecutor") Executor taskExecutor) { @@ -70,15 +86,16 @@ public class TtsTaskServiceImpl extends ServiceImpl impl String sourceType = normalizeSourceType(request.getSourceType()); String sourceId = request.getSourceId().trim(); String voice = resolveVoice(request.getVoice()); + TtsEngineClient.SynthesisOptions options = resolveOptions(request); String cleaned = cleanText(loadSourceText(userId, sourceType, sourceId)); if (!StringUtils.hasText(cleaned)) { throw new IllegalArgumentException("Source text is empty"); } if (cleaned.length() > maxTextLength) { - cleaned = cleaned.substring(0, maxTextLength); + cleaned = limitReadableText(cleaned, maxTextLength); } - String hash = DigestUtils.md5DigestAsHex((voice + "\n" + cleaned).getBytes(StandardCharsets.UTF_8)); + String hash = DigestUtils.md5DigestAsHex((voice + "\n" + options.cacheKey() + "\n" + cleaned).getBytes(StandardCharsets.UTF_8)); TtsTask owned = findOwnedTask(userId, sourceType, sourceId, voice, hash); if (owned != null) { incrementRequestCount(owned); @@ -100,7 +117,7 @@ public class TtsTaskServiceImpl extends ServiceImpl impl TtsTask task = buildTask(userId, sourceType, sourceId, voice, hash, cleaned.length()); save(task); String synthesisText = cleaned; - CompletableFuture.runAsync(() -> process(task.getId(), synthesisText, voice, task.getAudioPath()), taskExecutor); + CompletableFuture.runAsync(() -> process(task.getId(), synthesisText, voice, task.getAudioPath(), options), taskExecutor); return toResponse(task); } @@ -115,20 +132,22 @@ public class TtsTaskServiceImpl extends ServiceImpl impl } @Override - public TtsTaskResponse getBySource(String sourceType, String sourceId, String voice) { + public TtsTaskResponse getBySource(String sourceType, String sourceId, String voice, + Double speechRate, Double pitch, String emotion) { String userId = currentUserId(); - TtsTask task = getOne(new LambdaQueryWrapper() - .eq(TtsTask::getUserId, userId) - .eq(TtsTask::getSourceType, normalizeSourceType(sourceType)) - .eq(TtsTask::getSourceId, sourceId) - .eq(TtsTask::getVoice, resolveVoice(voice)) - .eq(TtsTask::getIsDeleted, 0) - .orderByDesc(TtsTask::getCreateTime) - .last("LIMIT 1")); + String normalizedSourceType = normalizeSourceType(sourceType); + String normalizedVoice = resolveVoice(voice); + TtsEngineClient.SynthesisOptions options = resolveOptions(speechRate, pitch, emotion); + String cleaned = cleanText(loadSourceText(userId, normalizedSourceType, sourceId)); + if (cleaned.length() > maxTextLength) { + cleaned = limitReadableText(cleaned, maxTextLength); + } + String hash = DigestUtils.md5DigestAsHex((normalizedVoice + "\n" + options.cacheKey() + "\n" + cleaned).getBytes(StandardCharsets.UTF_8)); + TtsTask task = findOwnedTask(userId, normalizedSourceType, sourceId, normalizedVoice, hash); return task == null ? null : toResponse(task); } - private void process(String taskId, String text, String voice, String outputPath) { + private void process(String taskId, String text, String voice, String outputPath, TtsEngineClient.SynthesisOptions options) { try { TtsTask task = getById(taskId); if (task == null) { @@ -138,7 +157,7 @@ public class TtsTaskServiceImpl extends ServiceImpl impl task.setErrorMessage(null); updateById(task); - TtsEngineClient.TtsEngineResult result = ttsEngineClient.synthesize(text, voice, outputPath); + TtsEngineClient.TtsEngineResult result = ttsEngineClient.synthesize(text, voice, outputPath, options); task = getById(taskId); if (task == null) { return; @@ -220,13 +239,15 @@ public class TtsTaskServiceImpl extends ServiceImpl impl StringBuilder text = new StringBuilder(); append(text, script.getTitle()); - append(text, script.getPlotIntro()); - append(text, script.getPlotTurning()); - append(text, script.getPlotClimax()); - append(text, script.getPlotEnding()); Map plotJson = script.getPlotJson(); - if (plotJson != null && plotJson.get("fullContent") != null) { - append(text, String.valueOf(plotJson.get("fullContent"))); + Object fullContent = plotJson == null ? null : plotJson.get("fullContent"); + if (fullContent != null && StringUtils.hasText(String.valueOf(fullContent))) { + append(text, String.valueOf(fullContent)); + } else { + append(text, script.getPlotIntro()); + append(text, script.getPlotTurning()); + append(text, script.getPlotClimax()); + append(text, script.getPlotEnding()); } return text.toString(); } @@ -235,9 +256,39 @@ public class TtsTaskServiceImpl extends ServiceImpl impl if (text == null) { return ""; } - return text.replaceAll("[#>*_`\\-]", "") - .replaceAll("\\s+", " ") - .trim(); + String normalized = text.replace("\r\n", "\n") + .replace('\r', '\n') + .replaceAll("!\\[[^\\]]*]\\([^)]*\\)", "") + .replaceAll("\\[([^\\]]+)]\\([^)]*\\)", "$1") + .replaceAll("(?m)^\\s{0,3}#{1,6}\\s*", "") + .replaceAll("(?m)^\\s*>\\s?", "") + .replaceAll("(?m)^\\s*[-*+]\\s+", "") + .replaceAll("(?m)^\\s*\\d+[.)、]\\s+", "") + .replaceAll("<[^>]+>", "") + .replaceAll("[*_`~]", "") + .replaceAll("[“”]", "\"") + .replaceAll("[‘’]", "'") + .replaceAll("\\.{3,}", "……") + .replaceAll("-{2,}", ",") + .replaceAll("[\\t\\u00A0]+", " ") + .replaceAll(" {2,}", " ") + .replaceAll("(?<=[\\p{IsHan}])[ \\t\\u00A0]+(?=[\\p{IsHan}])", "") + .replaceAll("[ \\t\\u00A0]*([,。!?;:、,.!?;:])[ \\t\\u00A0]*", "$1") + .replaceAll(",", ",") + .replaceAll("!", "!") + .replaceAll("\\?", "?") + .replaceAll(";", ";") + .replaceAll(":", ":"); + + List paragraphs = new ArrayList<>(); + for (String paragraph : normalized.split("\\n+")) { + String trimmed = paragraph.trim(); + if (!StringUtils.hasText(trimmed)) { + continue; + } + paragraphs.addAll(toReadableParagraphs(trimmed)); + } + return String.join("\n\n", paragraphs).trim(); } private TtsTaskResponse toResponse(TtsTask task) { @@ -253,6 +304,23 @@ public class TtsTaskServiceImpl extends ServiceImpl impl .build(); } + private TtsEngineClient.SynthesisOptions resolveOptions(TtsTaskCreateRequest request) { + return resolveOptions(request.getSpeechRate(), request.getPitch(), request.getEmotion()); + } + + private TtsEngineClient.SynthesisOptions resolveOptions(Double requestSpeechRate, Double requestPitch, String requestEmotion) { + double speechRate = requestSpeechRate == null ? defaultOrFallback(defaultSpeechRate, FALLBACK_SPEECH_RATE) : requestSpeechRate; + double pitch = requestPitch == null ? defaultPitch : requestPitch; + String emotion = StringUtils.hasText(requestEmotion) + ? requestEmotion.trim() + : (StringUtils.hasText(defaultEmotion) ? defaultEmotion.trim() : FALLBACK_EMOTION); + return new TtsEngineClient.SynthesisOptions( + round(clamp(speechRate, 0.60D, 1.40D)), + round(clamp(pitch, -20D, 20D)), + emotion.toLowerCase(Locale.ROOT) + ); + } + private String currentUserId() { String userId = UserContextHolder.getCurrentUserId(); if (!StringUtils.hasText(userId)) { @@ -275,6 +343,91 @@ public class TtsTaskServiceImpl extends ServiceImpl impl } } + private static List toReadableParagraphs(String paragraph) { + List parts = new ArrayList<>(); + StringBuilder current = new StringBuilder(); + for (int index = 0; index < paragraph.length(); index++) { + char ch = paragraph.charAt(index); + current.append(ch); + if (isHardSentenceEnd(ch) || (current.length() >= NATURAL_PARAGRAPH_LIMIT && isSoftPause(ch))) { + addReadablePart(parts, current); + } + } + addReadablePart(parts, current); + return parts; + } + + private static void addReadablePart(List parts, StringBuilder current) { + String value = current.toString().trim(); + current.setLength(0); + if (!StringUtils.hasText(value)) { + return; + } + if (value.length() > NATURAL_PARAGRAPH_LIMIT + 40) { + splitLongText(parts, value); + return; + } + parts.add(ensureSentenceEnding(value)); + } + + private static void splitLongText(List parts, String value) { + StringBuilder chunk = new StringBuilder(); + for (int index = 0; index < value.length(); index++) { + char ch = value.charAt(index); + chunk.append(ch); + if (chunk.length() >= NATURAL_PARAGRAPH_LIMIT) { + parts.add(ensureSentenceEnding(chunk.toString().trim())); + chunk.setLength(0); + } + } + if (chunk.length() > 0) { + parts.add(ensureSentenceEnding(chunk.toString().trim())); + } + } + + private static String ensureSentenceEnding(String value) { + if (!StringUtils.hasText(value)) { + return ""; + } + char last = value.charAt(value.length() - 1); + return isHardSentenceEnd(last) || isSoftPause(last) ? value : value + "。"; + } + + private static boolean isHardSentenceEnd(char ch) { + return ch == '。' || ch == '!' || ch == '?' || ch == ';' || ch == '…'; + } + + private static boolean isSoftPause(char ch) { + return ch == ',' || ch == '、' || ch == ':'; + } + + private static String limitReadableText(String text, int limit) { + if (text.length() <= limit) { + return text; + } + String truncated = text.substring(0, limit); + int cut = Math.max( + Math.max(truncated.lastIndexOf('。'), truncated.lastIndexOf('!')), + Math.max(truncated.lastIndexOf('?'), truncated.lastIndexOf('\n')) + ); + if (cut > limit * 0.75) { + return truncated.substring(0, cut + 1).trim(); + } + return ensureSentenceEnding(truncated.trim()); + } + + private static double clamp(double value, double min, double max) { + return Math.max(min, Math.min(max, value)); + } + + private static double round(double value) { + return Math.round(value * 100D) / 100D; + } + + private static double defaultOrFallback(double value, double fallback) { + return value <= 0D ? fallback : value; + } + private static String joinPath(String prefix, String filename) { if (prefix.endsWith("/")) { return prefix + filename; diff --git a/backend-single/src/main/resources/application-prod.yml b/backend-single/src/main/resources/application-prod.yml index 9f0fd54..672b00a 100644 --- a/backend-single/src/main/resources/application-prod.yml +++ b/backend-single/src/main/resources/application-prod.yml @@ -69,6 +69,9 @@ emotion: public-url-prefix: /tts/audio max-text-length: 5000 default-voice: default_zh_female + default-speech-rate: 0.92 + default-pitch: 0 + default-emotion: story # Speech-to-text config asr: diff --git a/backend-single/src/main/resources/application.yml b/backend-single/src/main/resources/application.yml index 60ed752..9fce6b9 100644 --- a/backend-single/src/main/resources/application.yml +++ b/backend-single/src/main/resources/application.yml @@ -106,6 +106,9 @@ emotion: public-url-prefix: /tts/audio max-text-length: 5000 default-voice: default_zh_female + default-speech-rate: 0.92 + default-pitch: 0 + default-emotion: story # Speech-to-text config asr: diff --git a/backend-single/src/test/java/com/emotion/service/TtsTaskServiceTest.java b/backend-single/src/test/java/com/emotion/service/TtsTaskServiceTest.java index 231a97a..97ce6e1 100644 --- a/backend-single/src/test/java/com/emotion/service/TtsTaskServiceTest.java +++ b/backend-single/src/test/java/com/emotion/service/TtsTaskServiceTest.java @@ -12,11 +12,19 @@ import static org.junit.jupiter.api.Assertions.assertTrue; class TtsTaskServiceTest { @Test - @DisplayName("cleanText strips markdown and normalizes whitespace") - void cleanTextStripsMarkdownAndNormalizesWhitespace() { - String cleaned = TtsTaskServiceImpl.cleanText("# Title\n\n> **hello** `world` - ok"); + @DisplayName("cleanText strips markdown but keeps Chinese narration rhythm") + void cleanTextStripsMarkdownButKeepsChineseNarrationRhythm() { + String cleaned = TtsTaskServiceImpl.cleanText("# 第一章\n\n> **她 终于** 看见了自己\n\n- 转身离开"); - assertEquals("Title hello world ok", cleaned); + assertEquals("第一章。\n\n她终于看见了自己。\n\n转身离开。", cleaned); + } + + @Test + @DisplayName("cleanText preserves sentence punctuation for natural pauses") + void cleanTextPreservesSentencePunctuationForNaturalPauses() { + String cleaned = TtsTaskServiceImpl.cleanText("他说: 这一次,我想自己选择!\n\n你听见了吗?"); + + assertEquals("他说:这一次,我想自己选择!\n\n你听见了吗?", cleaned); } @Test @@ -28,6 +36,9 @@ class TtsTaskServiceTest { @Test @DisplayName("TtsEngineResult exposes synthesis result fields") void ttsEngineResultExposesFields() { + TtsEngineClient.SynthesisOptions options = new TtsEngineClient.SynthesisOptions(0.92D, 0D, "story"); + assertEquals("rate=0.92;pitch=0.0;emotion=story", options.cacheKey()); + TtsEngineClient.TtsEngineResult result = new TtsEngineClient.TtsEngineResult(true, "/tmp/a.mp3", 1200L, null); diff --git a/backend-single/tts-service/app.py b/backend-single/tts-service/app.py index e6af167..35460ad 100644 --- a/backend-single/tts-service/app.py +++ b/backend-single/tts-service/app.py @@ -1,5 +1,6 @@ import subprocess from pathlib import Path +from typing import Optional from fastapi import FastAPI from pydantic import BaseModel, Field @@ -16,6 +17,42 @@ class SynthesizeRequest(BaseModel): text: str = Field(min_length=1, max_length=5000) voice: str = "default_zh_female" outputPath: str + speechRate: Optional[float] = Field(default=0.92, ge=0.6, le=1.4) + pitch: Optional[float] = Field(default=0.0, ge=-20.0, le=20.0) + emotion: Optional[str] = "story" + + +def clamp(value: float, minimum: float, maximum: float) -> float: + return max(minimum, min(maximum, value)) + + +def resolve_piper_args(request: SynthesizeRequest) -> list[str]: + speech_rate = clamp(float(request.speechRate or 0.92), 0.6, 1.4) + emotion = (request.emotion or "story").lower() + length_scale = round(1.0 / speech_rate, 2) + sentence_silence = 0.46 + noise_scale = 0.64 + noise_w = 0.72 + + if emotion in {"calm", "soft", "warm"}: + sentence_silence = 0.5 + noise_scale = 0.58 + noise_w = 0.68 + elif emotion in {"story", "narration", "expressive"}: + sentence_silence = 0.48 + noise_scale = 0.68 + noise_w = 0.76 + + return [ + "--sentence-silence", + str(sentence_silence), + "--length_scale", + str(length_scale), + "--noise_scale", + str(noise_scale), + "--noise_w", + str(noise_w), + ] @app.get("/health") @@ -47,8 +84,7 @@ def synthesize(request: SynthesizeRequest): str(PIPER_CONFIG), "--output_file", str(output), - "--sentence-silence", - "0.35", + *resolve_piper_args(request), ], input=request.text, text=True,