语音转文本提取为通用方法

2026-01-04 22:06:14 +08:00
parent 31561465a8
commit 1167707631
1 changed files with 177 additions and 0 deletions
--- a/src/main/java/com/rj/service/impl/TtsRequestLogServiceImpl.java
+++ b/src/main/java/com/rj/service/impl/TtsRequestLogServiceImpl.java
@@ -4,6 +4,7 @@ import com.alibaba.dashscope.audio.asr.transcription.*;
 import com.rj.entity.TtsRequestLog;
 import com.rj.mapper.TtsRequestLogMapper;
 import com.rj.service.ITtsRequestLogService;
+import com.rj.service.MinIOService;
 import com.rj.utils.MinIOUrlGenerator;
 import com.rj.dto.AsrRequest;
 import com.rj.dto.AsrResponse;
@@ -12,9 +13,13 @@ import lombok.extern.slf4j.Slf4j;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.stereotype.Service;
+import org.springframework.web.multipart.MultipartFile;
+import org.springframework.mock.web.MockMultipartFile;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.JsonNode;

+import java.io.File;
+import java.nio.file.Files;
 import java.time.LocalDateTime;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -36,6 +41,9 @@ public class TtsRequestLogServiceImpl implements ITtsRequestLogService {
    @Autowired
    private MinIOUrlGenerator urlGenerator;
    
+    @Autowired
+    private MinIOService minIOService;
+    
    @Value("${dashscope.api.key}")
    private String apiKey;
    
@@ -111,6 +119,175 @@ public class TtsRequestLogServiceImpl implements ITtsRequestLogService {
        return false;
    }
    
+    /**
+     * Convenience overload: transcribes audio to text using the default ASR model.
+     * Delegates to the two-argument overload, passing {@code defaultAsrModel} as the model.
+     * 
+     * @param audioPathOrUrl local audio file path or URL of the audio to transcribe
+     * @return the transcribed text, or {@code null} when transcription fails
+     */
+    public String transcribeAudioToText(String audioPathOrUrl) {
+        return transcribeAudioToText(audioPathOrUrl, defaultAsrModel);
+    }
+    
+    /**
+     * Transcribes an audio file to text using the DashScope transcription classes.
+     * Accepts an http(s) URL, or a local file path that is first uploaded via {@code minIOService}.
+     * 
+     * @param audioPathOrUrl local audio file path or http(s) URL; surrounding whitespace is ignored
+     * @param model ASR model name; {@code defaultAsrModel} is used when this is null or blank
+     * @return the recognized text, or {@code null} on any failure or when nothing was recognized
+     */
+    public String transcribeAudioToText(String audioPathOrUrl, String model) {
+        if (audioPathOrUrl == null || audioPathOrUrl.trim().isEmpty()) {
+            log.error("音频文件路径或URL不能为空");
+            return null;
+        }
+        
+        if (!isAsrServiceAvailable()) {
+            log.error("ASR服务不可用，请检查配置");
+            return null;
+        }
+        
+        try {
+            String source = audioPathOrUrl.trim(); // trimmed once so every later step sees the same value
+            String audioUrl = source;
+            
+            // Treat the input as a URL when it has an http/https scheme; scheme names are case-insensitive
+            if (!source.regionMatches(true, 0, "http://", 0, 7) && !source.regionMatches(true, 0, "https://", 0, 8)) {
+                log.info("检测到本地文件路径: {}", source);
+                
+                // Bail out early unless the path points at an existing regular file
+                File file = new File(source);
+                if (!file.exists() || !file.isFile()) {
+                    log.error("本地文件不存在或不是文件: {}", source);
+                    return null;
+                }
+                
+                // The transcription request takes file URLs, so a local file is uploaded to MinIO first
+                try {
+                    String fileName = file.getName();
+                    String contentType = getContentType(fileName);
+                    
+                    // The whole file is read into memory before the upload
+                    byte[] fileBytes = Files.readAllBytes(file.toPath());
+                    
+                    // NOTE(review): MockMultipartFile comes from spring-test; confirm it is on the runtime classpath
+                    MultipartFile multipartFile = new MockMultipartFile(
+                            "file",
+                            fileName,
+                            contentType,
+                            fileBytes
+                    );
+                    
+                    // Upload to MinIO; the returned value is used as the audio URL from here on
+                    audioUrl = minIOService.uploadFile(multipartFile);
+                    log.info("本地文件已上传到MinIO，URL: {}", audioUrl);
+                } catch (Exception e) {
+                    log.error("上传本地文件到MinIO失败: {}", e.getMessage(), e);
+                    return null;
+                }
+            } else {
+                log.info("使用音频URL: {}", audioUrl);
+            }
+            
+            // Fall back to the configured default when no model was supplied
+            String asrModel = (model != null && !model.trim().isEmpty()) ? model : defaultAsrModel;
+            
+            log.info("开始语音识别 - 音频URL: {}, 模型: {}", audioUrl, asrModel);
+            
+            // Build the transcription request parameters
+            TranscriptionParam param = TranscriptionParam.builder()
+                    .apiKey(apiKey)
+                    .model(asrModel)
+                    .fileUrls(Arrays.asList(audioUrl))
+                    .build();
+            
+            // Create the transcription client object
+            Transcription transcription = new Transcription();
+            
+            // Submit the request asynchronously and keep the task id for the follow-up query and logs
+            TranscriptionResult result = transcription.asyncCall(param);
+            String taskId = result.getTaskId();
+            log.info("ASR任务已提交 - TaskId: {}", taskId);
+            
+            // Wait for the submitted task to complete
+            TranscriptionQueryParam queryParam = TranscriptionQueryParam.FromTranscriptionParam(param, taskId);
+            result = transcription.wait(queryParam);
+            log.info("语音识别完成 - TaskId: {}", taskId);
+            
+            if (result.getResults() != null && !result.getResults().isEmpty()) {
+                // Accumulates the text of every task result, separated by single spaces
+                StringBuilder fullText = new StringBuilder();
+                
+                // Each task result exposes a transcriptionUrl; download it and extract the text
+                for (TranscriptionTaskResult taskResult : result.getResults()) {
+                    if (taskResult.getTranscriptionUrl() != null) {
+                        try {
+                            // Fetch the raw transcription payload from transcriptionUrl
+                            String transcriptionResult = downloadTranscriptionResult(taskResult.getTranscriptionUrl());
+                            
+                            if (transcriptionResult != null && !transcriptionResult.trim().isEmpty()) {
+                                // Extract the text from the payload via parseTranscriptionResult
+                                String extractedText = parseTranscriptionResult(transcriptionResult);
+                                if (extractedText != null && !extractedText.trim().isEmpty()) {
+                                    if (fullText.length() > 0) {
+                                        fullText.append(" ");
+                                    }
+                                    fullText.append(extractedText);
+                                }
+                            }
+                        } catch (Exception e) {
+                            log.warn("下载transcription结果失败: {}", e.getMessage(), e);
+                        }
+                    }
+                }
+                
+                String resultText = fullText.toString().trim();
+                if (resultText.isEmpty()) {
+                    log.warn("ASR识别结果为空 - TaskId: {}", taskId);
+                    return null;
+                }
+                
+                log.info("语音识别完成 - 识别文本长度: {}, TaskId: {}", resultText.length(), taskId);
+                return resultText;
+            } else {
+                log.warn("ASR识别结果为空 - TaskId: {}", taskId);
+                return null;
+            }
+            
+        } catch (Exception e) {
+            log.error("语音识别失败: {}", e.getMessage(), e);
+            return null;
+        }
+    }
+    
+    /**
+     * Resolves the MIME type of an audio file from its file-name extension.
+     * The match is case-insensitive because the name is lower-cased first.
+     * A null name or an unrecognized extension yields "application/octet-stream".
+     */
+    private String getContentType(String fileName) {
+        final String fallback = "application/octet-stream";
+        if (fileName == null) {
+            return fallback;
+        }
+        // Compare on the suffix from the last dot onward (e.g. ".mp3"); empty when there is no dot
+        String lowered = fileName.toLowerCase();
+        int dotIndex = lowered.lastIndexOf('.');
+        String extension = dotIndex < 0 ? "" : lowered.substring(dotIndex);
+        switch (extension) {
+            case ".mp3": return "audio/mpeg";
+            case ".wav": return "audio/wav";
+            case ".m4a": return "audio/mp4";
+            case ".aac": return "audio/aac";
+            case ".ogg": return "audio/ogg";
+            case ".flac": return "audio/flac";
+            case ".wma": return "audio/x-ms-wma";
+            default: return fallback;
+        }
+    }
+    
    @Override
    public AsrResponse speechToText(AsrRequest request) {
        long startTime = System.currentTimeMillis();