hai 2 días · 9d976d7f23
--- a/blade-service/blade-archive/src/main/java/org/springblade/archive/service/impl/ArchivesAutoServiceImpl.java
+++ b/blade-service/blade-archive/src/main/java/org/springblade/archive/service/impl/ArchivesAutoServiceImpl.java
@@ -106,6 +106,7 @@ import java.io.*;
 
				 import java.math.BigDecimal;
			
 
				 import java.net.URL;
			
 
				 import java.net.URLEncoder;
			
 
				+import java.nio.charset.StandardCharsets;
			
 
				 import java.nio.file.Files;
			
 
				 import java.nio.file.Path;
			
 
				 import java.nio.file.Paths;
			
@@ -5381,9 +5382,8 @@ public class ArchivesAutoServiceImpl extends BaseServiceImpl<ArchivesAutoMapper,
 
				 	@Override
			
 
				 	@Async
			
 
				 	public Boolean atuoOCR(List<Long> idsList) throws Exception {
			
 
				-		String url="/mnt/sdc/AutoPdf/";
			
 
				-		//String url="D:\\AutoPdf\\";
			
 
				-		//List<Long> idsList=Func.toLongList(ids);
			
 
				+		//String url="/mnt/sdc/AutoPdf/";
			
 
				+		String url="D:\\AutoPdf\\";
			
 
				 		List<ArchivesAuto> archivesAutoList = this.list(new LambdaQueryWrapper<ArchivesAuto>().in(ArchivesAuto::getId, idsList));
			
 
				 		this.update(Wrappers.<ArchivesAuto>lambdaUpdate().set(ArchivesAuto::getColourStatus, 2).in(ArchivesAuto::getId, idsList));
			
 
				 		for (ArchivesAuto auto : archivesAutoList) {
			
@@ -5468,66 +5468,160 @@ public class ArchivesAutoServiceImpl extends BaseServiceImpl<ArchivesAutoMapper,
 
				 		return new LocalDateTime[]{startDateTime, endDateTime};
			
 
				 	}
			
 
				 
			
 
				-	public List<String> extractTextFromPDF(String pdfFilePath) throws IOException, InterruptedException {
			
 
				-		//String PYTHON_SCRIPT_PATH = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\Python\\pdfTextExtractorWindows.py";
			
 
				-		//String PYTHON_INTERPRETER = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\python.exe";
			
 
				-		System.out.println("进入识别1");
			
 
				-		String PYTHON_SCRIPT_PATH = "/www/wwwlogs/python/pdfTextExtractorWindows.py";
			
 
				-		String PYTHON_INTERPRETER = "python3";
			
 
				-		String[] command = {
			
 
				-				PYTHON_INTERPRETER,
			
 
				-				PYTHON_SCRIPT_PATH,
			
 
				-				pdfFilePath
			
 
				-		};
			
 
				-
			
 
				-		Process process = new ProcessBuilder(command)
			
 
				-				.redirectErrorStream(true)
			
 
				-				.start();
			
 
				-		System.out.println("进入识别2");
			
 
				-		// 读取Python输出
			
 
				-		StringBuilder output = new StringBuilder();
			
 
				-		try (InputStream inputStream = process.getInputStream();
			
 
				-			 BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))) {
			
 
				-
			
 
				-			String line;
			
 
				-			while ((line = reader.readLine()) != null) {
			
 
				-				output.append(line);
			
 
				-			}
			
 
				-		}
			
 
				-		System.out.println("进入识别3");
			
 
				-		int exitCode = process.waitFor();
			
 
				-		if (exitCode != 0) {
			
 
				-			System.out.println("进入识别4");
			
 
				-			throw new RuntimeException("Python脚本执行失败，退出码: " + exitCode + ", 输出: " + output.toString());
			
 
				-		}
			
 
				-
			
 
				-		// -------------------------- 关键修改：提取纯JSON部分 --------------------------
			
 
				-		String rawOutput = output.toString();
			
 
				-		// 找到JSON的起始位置（第一个'{'）和结束位置（最后一个'}'）
			
 
				-		int jsonStart = rawOutput.indexOf('{');
			
 
				-		int jsonEnd = rawOutput.lastIndexOf('}');
			
 
				-		System.out.println("进入识别5");
			
 
				-		if (jsonStart == -1 || jsonEnd == -1 || jsonStart >= jsonEnd) {
			
 
				-			System.out.println("进入识别6");
			
 
				-			throw new RuntimeException("无法提取有效的JSON结果，原始输出: " + rawOutput);
			
 
				-		}
			
 
				-		// 截取纯JSON字符串
			
 
				-		String jsonStr = rawOutput.substring(jsonStart, jsonEnd + 1);
			
 
				-		System.out.println("进入识别7");
			
 
				-		// 解析清理后的JSON
			
 
				-		Gson gson = new Gson();
			
 
				-		Type type = new TypeToken<Map<String, Object>>(){}.getType();
			
 
				-		Map<String, Object> resultMap = gson.fromJson(jsonStr, type);
			
 
				-
			
 
				-		if (!"success".equals(resultMap.get("status"))) {
			
 
				-			System.out.println("进入识别8");
			
 
				+//	public List<String> extractTextFromPDF(String pdfFilePath) throws IOException, InterruptedException {
			
 
				+//		//String PYTHON_SCRIPT_PATH = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\Python\\pdfTextExtractorWindows.py";
			
 
				+//		//String PYTHON_INTERPRETER = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\python.exe";
			
 
				+//		String PYTHON_SCRIPT_PATH = "/www/wwwlogs/python/pdfTextExtractorWindows.py";
			
 
				+//		String PYTHON_INTERPRETER = "python3";
			
 
				+//		String[] command = {
			
 
				+//				PYTHON_INTERPRETER,
			
 
				+//				PYTHON_SCRIPT_PATH,
			
 
				+//				pdfFilePath
			
 
				+//		};
			
 
				+//
			
 
				+//		Process process = new ProcessBuilder(command)
			
 
				+//				.redirectErrorStream(true)
			
 
				+//				.start();
			
 
				+//		// 读取Python输出
			
 
				+//		StringBuilder output = new StringBuilder();
			
 
				+//		try (InputStream inputStream = process.getInputStream();
			
 
				+//			 BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))) {
			
 
				+//
			
 
				+//			String line;
			
 
				+//			while ((line = reader.readLine()) != null) {
			
 
				+//				output.append(line);
			
 
				+//			}
			
 
				+//		}
			
 
				+//		int exitCode = process.waitFor();
			
 
				+//		if (exitCode != 0) {
			
 
				+//			throw new RuntimeException("Python脚本执行失败，退出码: " + exitCode + ", 输出: " + output.toString());
			
 
				+//		}
			
 
				+//
			
 
				+//		// -------------------------- 关键修改：提取纯JSON部分 --------------------------
			
 
				+//		String rawOutput = output.toString();
			
 
				+//		// 找到JSON的起始位置（第一个'{'）和结束位置（最后一个'}'）
			
 
				+//		int jsonStart = rawOutput.indexOf('{');
			
 
				+//		int jsonEnd = rawOutput.lastIndexOf('}');
			
 
				+//		if (jsonStart == -1 || jsonEnd == -1 || jsonStart >= jsonEnd) {
			
 
				+//			throw new RuntimeException("无法提取有效的JSON结果，原始输出: " + rawOutput);
			
 
				+//		}
			
 
				+//		// 截取纯JSON字符串
			
 
				+//		String jsonStr = rawOutput.substring(jsonStart, jsonEnd + 1);
			
 
				+//		// 解析清理后的JSON
			
 
				+//		Gson gson = new Gson();
			
 
				+//		Type type = new TypeToken<Map<String, Object>>(){}.getType();
			
 
				+//		Map<String, Object> resultMap = gson.fromJson(jsonStr, type);
			
 
				+//
			
 
				+//		if (!"success".equals(resultMap.get("status"))) {
			
 
				+//			String message = (String) resultMap.get("message");
			
 
				+//			throw new RuntimeException("处理PDF失败: " + (message != null ? message : "未知错误"));
			
 
				+//		}
			
 
				+//		Type listType = new TypeToken<List<String>>(){}.getType();
			
 
				+//		return gson.fromJson(gson.toJson(resultMap.get("lines")), listType);
			
 
				+//	}
			
 
				+/**
				+ * Runs the external OCR Python script on a PDF and returns the recognized text lines.
				+ *
				+ * <p>The script is launched with the PDF path as its only argument and with the script's
				+ * own directory as working directory. stderr is merged into stdout, so the JSON result is
				+ * cut out of the combined output (first '{' to last '}') before it is parsed. The parsed
				+ * object is read for "status" ("success" on success), an optional "message" and a
				+ * "lines" array.
				+ *
				+ * @param pdfFilePath path of the PDF passed to the script
				+ * @return the trimmed, non-empty entries of "lines"; an empty list when "lines" is absent
				+ * @throws IOException if the process cannot be started or its output cannot be read
				+ * @throws InterruptedException if the thread is interrupted while waiting for the script
				+ * @throws RuntimeException if the script file is missing, the script exits non-zero,
				+ *         no JSON object can be found or parsed, or the reported status is not "success"
				+ */
				+public List<String> extractTextFromPDF(String pdfFilePath) throws IOException, InterruptedException {
				+	// NOTE(review): these are machine-specific Windows paths; the version replaced by this
				+	// change used "/www/wwwlogs/python/pdfTextExtractorWindows.py" and "python3" — confirm
				+	// which environment this is meant to run in before deploying.
				+	String PYTHON_SCRIPT_PATH = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\Python\\autoOCR.py";
				+	String PYTHON_INTERPRETER = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\python.exe";
				+
				+	// Fail early with a clear message when the script is missing (or is not a regular file).
				+	File scriptFile = new File(PYTHON_SCRIPT_PATH);
				+	if (!scriptFile.isFile()) {
				+		throw new RuntimeException("Python脚本不存在。请检查路径: " + PYTHON_SCRIPT_PATH);
				+	}
				+	System.out.println("使用Python解释器: " + PYTHON_INTERPRETER);
				+	System.out.println("使用Python脚本: " + PYTHON_SCRIPT_PATH);
				+
				+	String[] command = {
				+			PYTHON_INTERPRETER,
				+			PYTHON_SCRIPT_PATH,
				+			pdfFilePath
				+	};
				+	// Print the executed command for debugging.
				+	System.out.println("执行的命令: " + String.join(" ", command));
				+
				+	ProcessBuilder processBuilder = new ProcessBuilder(command);
				+	processBuilder.redirectErrorStream(true);
				+
				+	// Run the script from its own directory.
				+	File scriptDir = scriptFile.getParentFile();
				+	processBuilder.directory(scriptDir);
				+	System.out.println("工作目录: " + scriptDir.getAbsolutePath());
				+
				+	Process process = processBuilder.start();
				+
				+	// Drain the merged stdout/stderr completely before waiting for the exit code.
				+	StringBuilder output = new StringBuilder();
				+	int exitCode;
				+	try {
				+		try (BufferedReader reader = new BufferedReader(
				+				new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
				+			String line;
				+			while ((line = reader.readLine()) != null) {
				+				output.append(line).append("\n");
				+			}
				+		}
				+		exitCode = process.waitFor();
				+	} finally {
				+		// Has no effect once the script has exited; otherwise (read failure, interrupt)
				+		// it keeps the child process from being left running.
				+		process.destroy();
				+	}
				+
				+	String rawOutput = output.toString();
				+	System.out.println("Python脚本执行完成，退出码: " + exitCode);
				+	System.out.println("原始输出: \n" + rawOutput);
				+
				+	if (exitCode != 0) {
				+		// Default message carries the script output so the failure can be diagnosed.
				+		String errorMsg = "Python脚本执行失败，退出码: " + exitCode + ", 输出: " + rawOutput;
				+
				+		// Replace it with a more specific hint for well-known failures.
				+		if (rawOutput.contains("No such file or directory")) {
				+			errorMsg = "Python脚本文件未找到，请检查路径: " + PYTHON_SCRIPT_PATH;
				+		} else if (rawOutput.contains("ImportError") || rawOutput.contains("ModuleNotFoundError")) {
				+			errorMsg = "Python模块未安装。请运行: pip install PyMuPDF opencv-python paddleocr paddlepaddle numpy";
				+		}
				+
				+		throw new RuntimeException(errorMsg);
				+	}
				+
				+	if (rawOutput.trim().isEmpty()) {
				+		throw new RuntimeException("Python脚本没有返回任何内容");
				+	}
				+
				+	// stderr is merged into stdout, so log or warning lines may surround the JSON object:
				+	// keep only the text from the first '{' to the last '}'.
				+	int jsonStart = rawOutput.indexOf('{');
				+	int jsonEnd = rawOutput.lastIndexOf('}');
				+	if (jsonStart == -1 || jsonEnd == -1 || jsonStart >= jsonEnd) {
				+		throw new RuntimeException("无法提取有效的JSON结果，原始输出: " + rawOutput);
				+	}
				+	String jsonStr = rawOutput.substring(jsonStart, jsonEnd + 1);
				+
				+	// Parse the JSON result.
				+	Gson gson = new Gson();
				+	try {
				+		Map<String, Object> resultMap = gson.fromJson(jsonStr,
				+				new TypeToken<Map<String, Object>>(){}.getType());
				+
				+		if (resultMap == null) {
				+			throw new RuntimeException("无法解析JSON结果: " + jsonStr);
				+		}
				+
				+		// Compared as Object so a non-string status is reported as a failure, not a cast error.
				+		Object status = resultMap.get("status");
				+		if (!"success".equals(status)) {
				 			String message = (String) resultMap.get("message");
				-			throw new RuntimeException("处理PDF失败: " + (message != null ? message : "未知错误"));
				+			throw new RuntimeException("处理失败: " + (message != null ? message : "未知错误"));
				+		}
				+
				+		// Collect the text lines, skipping null and blank entries.
				+		List<String> lines = new ArrayList<>();
				+		List<?> linesData = (List<?>) resultMap.get("lines");
				+		if (linesData != null) {
				+			for (Object lineObj : linesData) {
				+				if (lineObj == null) {
				+					continue;
				+				}
				+				String text = lineObj.toString().trim();
				+				if (!text.isEmpty()) {
				+					lines.add(text);
				+				}
				+			}
				 		}
				-		System.out.println("进入识别9");
				-		Type listType = new TypeToken<List<String>>(){}.getType();
				-		return gson.fromJson(gson.toJson(resultMap.get("lines")), listType);
				+
				+		return lines;
				+
				+	} catch (Exception e) {
				+		// Keep the original exception as cause so its stack trace is not lost.
				+		throw new RuntimeException("解析结果失败: " + e.getMessage() + "\n原始输出: " + jsonStr, e);
				 	}
				+}
			
 
				 
			
 
				 	@Scheduled(fixedDelay = 1000 * 60 * 10)
			
 
				 	public void reCreateArchiveAuto() {