Pārlūkot izejas kodu

档案案卷识别

cr 1 dienu atpakaļ
vecāks
revīzija
43cdb5746f

+ 69 - 37
blade-service/blade-archive/src/main/java/org/springblade/archive/service/impl/ArchivesAutoServiceImpl.java

@@ -5382,8 +5382,8 @@ public class ArchivesAutoServiceImpl extends BaseServiceImpl<ArchivesAutoMapper,
 	@Override
 	@Async
 	public Boolean atuoOCR(List<Long> idsList) throws Exception {
-		String url="/mnt/sdc/AutoPdf/";
-		//String url="D:\\AutoPdf\\";
+		//String url="/mnt/sdc/AutoPdf/";
+		String url="D:\\AutoPdf\\";
 		List<ArchivesAuto> archivesAutoList = this.list(new LambdaQueryWrapper<ArchivesAuto>().in(ArchivesAuto::getId, idsList));
 		this.update(Wrappers.<ArchivesAuto>lambdaUpdate().set(ArchivesAuto::getColourStatus, 2).in(ArchivesAuto::getId, idsList));
 		for (ArchivesAuto auto : archivesAutoList) {
@@ -5467,60 +5467,92 @@ public class ArchivesAutoServiceImpl extends BaseServiceImpl<ArchivesAutoMapper,
 
 		return new LocalDateTime[]{startDateTime, endDateTime};
 	}
+	// Custom entity class matching the JSON structure (status, message, lines) that extractTextFromPDF parses with Gson
+	static class OcrResponse {
+		private String status;
+		private String message;
+		private List<String> lines;
+
+		public String getStatus() { return status; }
+		public void setStatus(String status) { this.status = status; }
+		public String getMessage() { return message; }
+		public void setMessage(String message) { this.message = message; }
+		public List<String> getLines() { return lines; }
+		public void setLines(List<String> lines) { this.lines = lines; }
+	}
 
 	public List<String> extractTextFromPDF(String pdfFilePath) throws IOException, InterruptedException {
-		//String PYTHON_SCRIPT_PATH = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\Python\\pdfTextExtractorWindows.py";
-		//String PYTHON_INTERPRETER = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\python.exe";
-		String PYTHON_SCRIPT_PATH = "/www/wwwlogs/python/pdfTextExtractorWindows.py";
-		String PYTHON_INTERPRETER = "python3";
-		String[] command = {
-				PYTHON_INTERPRETER,
-				PYTHON_SCRIPT_PATH,
-				pdfFilePath
-		};
+		// 1. Configure paths (NOTE(review): hard-coded to a Windows Python install - make sure they are correct for the target host)
+		String PYTHON_SCRIPT_PATH = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\Python\\pdf.py";
+		String PYTHON_INTERPRETER = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\python.exe";
+
+		// 2. Build the command: interpreter, script, then the PDF path as the only script argument
+		String[] command = {PYTHON_INTERPRETER, PYTHON_SCRIPT_PATH, pdfFilePath};
 
+		// 3. Run the process (stdout and stderr are kept as separate streams)
 		Process process = new ProcessBuilder(command)
-				.redirectErrorStream(true)
+				.redirectErrorStream(false)
 				.start();
-		// 读取Python输出
-		StringBuilder output = new StringBuilder();
-		try (InputStream inputStream = process.getInputStream();
-			 BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))) {
 
+		// 4. Read the JSON result (stdout)
+		StringBuilder jsonOutput = new StringBuilder();
+		try (BufferedReader reader = new BufferedReader(
+				new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
 			String line;
 			while ((line = reader.readLine()) != null) {
-				output.append(line);
+				jsonOutput.append(line.trim()); // strip leading/trailing whitespace; lines are joined with no separator
 			}
 		}
+
+		// 5. Read the error log (stderr, for debugging). NOTE(review): stderr is drained only after stdout reaches EOF; if the script fills the stderr pipe first it may block - confirm output volume
+		StringBuilder errorOutput = new StringBuilder();
+		try (BufferedReader errorReader = new BufferedReader(
+				new InputStreamReader(process.getErrorStream(), StandardCharsets.UTF_8))) {
+			String line;
+			while ((line = errorReader.readLine()) != null) {
+				errorOutput.append(line).append("\n");
+			}
+		}
+
+		// 6. Check the process exit code; a non-zero code aborts with both captured streams in the message
 		int exitCode = process.waitFor();
 		if (exitCode != 0) {
-			throw new RuntimeException("Python脚本执行失败,退出码: " + exitCode + ", 输出: " + output.toString());
+			throw new RuntimeException(
+					"Python脚本执行失败(退出码:" + exitCode + ")\n" +
+							"错误日志:" + errorOutput.toString() + "\n" +
+							"标准输出:" + jsonOutput.toString()
+			);
 		}
 
-		// -------------------------- 关键修改:提取纯JSON部分 --------------------------
-		String rawOutput = output.toString();
-		// 找到JSON的起始位置(第一个'{')和结束位置(最后一个'}')
-		int jsonStart = rawOutput.indexOf('{');
-		int jsonEnd = rawOutput.lastIndexOf('}');
-		if (jsonStart == -1 || jsonEnd == -1 || jsonStart >= jsonEnd) {
-			throw new RuntimeException("无法提取有效的JSON结果,原始输出: " + rawOutput);
+		// 7. Parse the JSON (core fix: parse directly into the entity class)
+		String jsonStr = jsonOutput.toString();
+		if (jsonStr.isEmpty()) {
+			throw new RuntimeException("Python脚本未输出任何结果(JSON 为空)");
 		}
-		// 截取纯JSON字符串
-		String jsonStr = rawOutput.substring(jsonStart, jsonEnd + 1);
-		// 解析清理后的JSON
-		Gson gson = new Gson();
-		Type type = new TypeToken<Map<String, Object>>(){}.getType();
-		Map<String, Object> resultMap = gson.fromJson(jsonStr, type);
 
-		if (!"success".equals(resultMap.get("status"))) {
-			String message = (String) resultMap.get("message");
-			throw new RuntimeException("处理PDF失败: " + (message != null ? message : "未知错误"));
+		// Extract the JSON string from the first '{' to the last '}' (filters out any log text around it). NOTE(review): unlike the removed check, jsonStart > jsonEnd is not rejected here - confirm
+		int jsonStart = jsonStr.indexOf('{');
+		int jsonEnd = jsonStr.lastIndexOf('}');
+		if (jsonStart == -1 || jsonEnd == -1) {
+			throw new RuntimeException(
+					"无法提取JSON结果,原始输出:" + jsonStr + "\n" +
+							"Python错误日志:" + errorOutput.toString()
+			);
 		}
-		Type listType = new TypeToken<List<String>>(){}.getType();
-		return gson.fromJson(gson.toJson(resultMap.get("lines")), listType);
-	}
+		jsonStr = jsonStr.substring(jsonStart, jsonEnd + 1);
+
+		// Parse straight into the entity class, avoiding the Map conversion used by the previous revision
+		Gson gson = new Gson();
+		OcrResponse ocrResponse = gson.fromJson(jsonStr, OcrResponse.class);
 
+		// 8. Validate the result status; anything other than "success" is reported as a failure
+		if (!"success".equals(ocrResponse.getStatus())) {
+			throw new RuntimeException("PDF处理失败: " + ocrResponse.getMessage() + "\nPython日志:" + errorOutput.toString());
+		}
 
+		// 9. Return the List<String> taken directly from the entity class's lines field
+		return ocrResponse.getLines();
+	}
 
 	@Scheduled(fixedDelay = 1000 * 60 * 10)
 	public void reCreateArchiveAuto() {