|
|
@@ -106,6 +106,7 @@ import java.io.*;
|
|
|
import java.math.BigDecimal;
|
|
|
import java.net.URL;
|
|
|
import java.net.URLEncoder;
|
|
|
+import java.nio.charset.StandardCharsets;
|
|
|
import java.nio.file.Files;
|
|
|
import java.nio.file.Path;
|
|
|
import java.nio.file.Paths;
|
|
|
@@ -5381,9 +5382,8 @@ public class ArchivesAutoServiceImpl extends BaseServiceImpl<ArchivesAutoMapper,
|
|
|
@Override
|
|
|
@Async
|
|
|
public Boolean atuoOCR(List<Long> idsList) throws Exception {
|
|
|
- String url="/mnt/sdc/AutoPdf/";
|
|
|
- //String url="D:\\AutoPdf\\";
|
|
|
- //List<Long> idsList=Func.toLongList(ids);
|
|
|
+ //String url="/mnt/sdc/AutoPdf/";
|
|
|
+ String url="D:\\AutoPdf\\";
|
|
|
List<ArchivesAuto> archivesAutoList = this.list(new LambdaQueryWrapper<ArchivesAuto>().in(ArchivesAuto::getId, idsList));
|
|
|
this.update(Wrappers.<ArchivesAuto>lambdaUpdate().set(ArchivesAuto::getColourStatus, 2).in(ArchivesAuto::getId, idsList));
|
|
|
for (ArchivesAuto auto : archivesAutoList) {
|
|
|
@@ -5468,66 +5468,160 @@ public class ArchivesAutoServiceImpl extends BaseServiceImpl<ArchivesAutoMapper,
|
|
|
return new LocalDateTime[]{startDateTime, endDateTime};
|
|
|
}
|
|
|
|
|
|
- public List<String> extractTextFromPDF(String pdfFilePath) throws IOException, InterruptedException {
|
|
|
- //String PYTHON_SCRIPT_PATH = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\Python\\pdfTextExtractorWindows.py";
|
|
|
- //String PYTHON_INTERPRETER = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\python.exe";
|
|
|
- System.out.println("进入识别1");
|
|
|
- String PYTHON_SCRIPT_PATH = "/www/wwwlogs/python/pdfTextExtractorWindows.py";
|
|
|
- String PYTHON_INTERPRETER = "python3";
|
|
|
- String[] command = {
|
|
|
- PYTHON_INTERPRETER,
|
|
|
- PYTHON_SCRIPT_PATH,
|
|
|
- pdfFilePath
|
|
|
- };
|
|
|
-
|
|
|
- Process process = new ProcessBuilder(command)
|
|
|
- .redirectErrorStream(true)
|
|
|
- .start();
|
|
|
- System.out.println("进入识别2");
|
|
|
- // 读取Python输出
|
|
|
- StringBuilder output = new StringBuilder();
|
|
|
- try (InputStream inputStream = process.getInputStream();
|
|
|
- BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))) {
|
|
|
-
|
|
|
- String line;
|
|
|
- while ((line = reader.readLine()) != null) {
|
|
|
- output.append(line);
|
|
|
- }
|
|
|
- }
|
|
|
- System.out.println("进入识别3");
|
|
|
- int exitCode = process.waitFor();
|
|
|
- if (exitCode != 0) {
|
|
|
- System.out.println("进入识别4");
|
|
|
- throw new RuntimeException("Python脚本执行失败,退出码: " + exitCode + ", 输出: " + output.toString());
|
|
|
- }
|
|
|
-
|
|
|
- // -------------------------- 关键修改:提取纯JSON部分 --------------------------
|
|
|
- String rawOutput = output.toString();
|
|
|
- // 找到JSON的起始位置(第一个'{')和结束位置(最后一个'}')
|
|
|
- int jsonStart = rawOutput.indexOf('{');
|
|
|
- int jsonEnd = rawOutput.lastIndexOf('}');
|
|
|
- System.out.println("进入识别5");
|
|
|
- if (jsonStart == -1 || jsonEnd == -1 || jsonStart >= jsonEnd) {
|
|
|
- System.out.println("进入识别6");
|
|
|
- throw new RuntimeException("无法提取有效的JSON结果,原始输出: " + rawOutput);
|
|
|
- }
|
|
|
- // 截取纯JSON字符串
|
|
|
- String jsonStr = rawOutput.substring(jsonStart, jsonEnd + 1);
|
|
|
- System.out.println("进入识别7");
|
|
|
- // 解析清理后的JSON
|
|
|
- Gson gson = new Gson();
|
|
|
- Type type = new TypeToken<Map<String, Object>>(){}.getType();
|
|
|
- Map<String, Object> resultMap = gson.fromJson(jsonStr, type);
|
|
|
-
|
|
|
- if (!"success".equals(resultMap.get("status"))) {
|
|
|
- System.out.println("进入识别8");
|
|
|
+// public List<String> extractTextFromPDF(String pdfFilePath) throws IOException, InterruptedException {
|
|
|
+// //String PYTHON_SCRIPT_PATH = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\Python\\pdfTextExtractorWindows.py";
|
|
|
+// //String PYTHON_INTERPRETER = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\python.exe";
|
|
|
+// String PYTHON_SCRIPT_PATH = "/www/wwwlogs/python/pdfTextExtractorWindows.py";
|
|
|
+// String PYTHON_INTERPRETER = "python3";
|
|
|
+// String[] command = {
|
|
|
+// PYTHON_INTERPRETER,
|
|
|
+// PYTHON_SCRIPT_PATH,
|
|
|
+// pdfFilePath
|
|
|
+// };
|
|
|
+//
|
|
|
+// Process process = new ProcessBuilder(command)
|
|
|
+// .redirectErrorStream(true)
|
|
|
+// .start();
|
|
|
+// // 读取Python输出
|
|
|
+// StringBuilder output = new StringBuilder();
|
|
|
+// try (InputStream inputStream = process.getInputStream();
|
|
|
+// BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))) {
|
|
|
+//
|
|
|
+// String line;
|
|
|
+// while ((line = reader.readLine()) != null) {
|
|
|
+// output.append(line);
|
|
|
+// }
|
|
|
+// }
|
|
|
+// int exitCode = process.waitFor();
|
|
|
+// if (exitCode != 0) {
|
|
|
+// throw new RuntimeException("Python脚本执行失败,退出码: " + exitCode + ", 输出: " + output.toString());
|
|
|
+// }
|
|
|
+//
|
|
|
+// // -------------------------- 关键修改:提取纯JSON部分 --------------------------
|
|
|
+// String rawOutput = output.toString();
|
|
|
+// // 找到JSON的起始位置(第一个'{')和结束位置(最后一个'}')
|
|
|
+// int jsonStart = rawOutput.indexOf('{');
|
|
|
+// int jsonEnd = rawOutput.lastIndexOf('}');
|
|
|
+// if (jsonStart == -1 || jsonEnd == -1 || jsonStart >= jsonEnd) {
|
|
|
+// throw new RuntimeException("无法提取有效的JSON结果,原始输出: " + rawOutput);
|
|
|
+// }
|
|
|
+// // 截取纯JSON字符串
|
|
|
+// String jsonStr = rawOutput.substring(jsonStart, jsonEnd + 1);
|
|
|
+// // 解析清理后的JSON
|
|
|
+// Gson gson = new Gson();
|
|
|
+// Type type = new TypeToken<Map<String, Object>>(){}.getType();
|
|
|
+// Map<String, Object> resultMap = gson.fromJson(jsonStr, type);
|
|
|
+//
|
|
|
+// if (!"success".equals(resultMap.get("status"))) {
|
|
|
+// String message = (String) resultMap.get("message");
|
|
|
+// throw new RuntimeException("处理PDF失败: " + (message != null ? message : "未知错误"));
|
|
|
+// }
|
|
|
+// Type listType = new TypeToken<List<String>>(){}.getType();
|
|
|
+// return gson.fromJson(gson.toJson(resultMap.get("lines")), listType);
|
|
|
+// }
|
|
|
+public List<String> extractTextFromPDF(String pdfFilePath) throws IOException, InterruptedException {
|
|
|
+ // 修正:atuoOCR.py -> autoOCR.py
|
|
|
+ String PYTHON_SCRIPT_PATH = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\Python\\autoOCR.py";
|
|
|
+ String PYTHON_INTERPRETER = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\python.exe";
|
|
|
+ // 检查文件是否存在
|
|
|
+ File scriptFile = new File(PYTHON_SCRIPT_PATH);
|
|
|
+ if (!scriptFile.exists()) {
|
|
|
+ throw new RuntimeException("Python脚本不存在。请检查路径: " + PYTHON_SCRIPT_PATH);
|
|
|
+ }
|
|
|
+ System.out.println("使用Python解释器: " + PYTHON_INTERPRETER);
|
|
|
+ System.out.println("使用Python脚本: " + PYTHON_SCRIPT_PATH);
|
|
|
+
|
|
|
+ String[] command = {
|
|
|
+ PYTHON_INTERPRETER,
|
|
|
+ PYTHON_SCRIPT_PATH,
|
|
|
+ pdfFilePath
|
|
|
+ };
|
|
|
+ // 打印执行的命令用于调试
|
|
|
+ System.out.print("执行的命令: ");
|
|
|
+ for (String cmd : command) {
|
|
|
+ System.out.print(cmd + " ");
|
|
|
+ }
|
|
|
+ System.out.println();
|
|
|
+
|
|
|
+ ProcessBuilder processBuilder = new ProcessBuilder(command);
|
|
|
+ processBuilder.redirectErrorStream(true);
|
|
|
+
|
|
|
+ // 设置工作目录为Python脚本所在目录
|
|
|
+ File scriptDir = new File(PYTHON_SCRIPT_PATH).getParentFile();
|
|
|
+ processBuilder.directory(scriptDir);
|
|
|
+
|
|
|
+ System.out.println("工作目录: " + scriptDir.getAbsolutePath());
|
|
|
+
|
|
|
+ Process process = processBuilder.start();
|
|
|
+
|
|
|
+ // 读取输出
|
|
|
+ StringBuilder output = new StringBuilder();
|
|
|
+ try (BufferedReader reader = new BufferedReader(
|
|
|
+ new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
|
|
|
+ String line;
|
|
|
+ while ((line = reader.readLine()) != null) {
|
|
|
+ output.append(line).append("\n");
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ int exitCode = process.waitFor();
|
|
|
+
|
|
|
+ System.out.println("Python脚本执行完成,退出码: " + exitCode);
|
|
|
+ System.out.println("原始输出: \n" + output.toString());
|
|
|
+
|
|
|
+ if (exitCode != 0) {
|
|
|
+ // 尝试解析错误信息
|
|
|
+ String errorMsg = "Python脚本执行失败,退出码: " + exitCode;
|
|
|
+
|
|
|
+ // 检查是否是常见错误
|
|
|
+ if (output.toString().contains("No such file or directory")) {
|
|
|
+ errorMsg = "Python脚本文件未找到,请检查路径: " + PYTHON_SCRIPT_PATH;
|
|
|
+ } else if (output.toString().contains("ImportError") || output.toString().contains("ModuleNotFoundError")) {
|
|
|
+ errorMsg = "Python模块未安装。请运行: pip install PyMuPDF opencv-python paddleocr paddlepaddle numpy";
|
|
|
+ }
|
|
|
+
|
|
|
+ throw new RuntimeException(errorMsg);
|
|
|
+ }
|
|
|
+
|
|
|
+ // 解析JSON结果
|
|
|
+ Gson gson = new Gson();
|
|
|
+ String jsonStr = output.toString().trim();
|
|
|
+
|
|
|
+ if (jsonStr.isEmpty()) {
|
|
|
+ throw new RuntimeException("Python脚本没有返回任何内容");
|
|
|
+ }
|
|
|
+
|
|
|
+ try {
|
|
|
+ Map<String, Object> resultMap = gson.fromJson(jsonStr,
|
|
|
+ new TypeToken<Map<String, Object>>(){}.getType());
|
|
|
+
|
|
|
+ if (resultMap == null) {
|
|
|
+ throw new RuntimeException("无法解析JSON结果: " + jsonStr);
|
|
|
+ }
|
|
|
+
|
|
|
+ String status = (String) resultMap.get("status");
|
|
|
+ if (!"success".equals(status)) {
|
|
|
String message = (String) resultMap.get("message");
|
|
|
- throw new RuntimeException("处理PDF失败: " + (message != null ? message : "未知错误"));
|
|
|
+ throw new RuntimeException("处理失败: " + (message != null ? message : "未知错误"));
|
|
|
+ }
|
|
|
+
|
|
|
+ // 提取文本行
|
|
|
+ List<String> lines = new ArrayList<>();
|
|
|
+ List<?> linesData = (List<?>) resultMap.get("lines");
|
|
|
+ if (linesData != null) {
|
|
|
+ for (Object lineObj : linesData) {
|
|
|
+ if (lineObj != null && !lineObj.toString().trim().isEmpty()) {
|
|
|
+ lines.add(lineObj.toString().trim());
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|
|
|
- System.out.println("进入识别9");
|
|
|
- Type listType = new TypeToken<List<String>>(){}.getType();
|
|
|
- return gson.fromJson(gson.toJson(resultMap.get("lines")), listType);
|
|
|
+
|
|
|
+ return lines;
|
|
|
+
|
|
|
+ } catch (Exception e) {
|
|
|
+ throw new RuntimeException("解析结果失败: " + e.getMessage() + "\n原始输出: " + jsonStr);
|
|
|
}
|
|
|
+}
|
|
|
|
|
|
@Scheduled(fixedDelay = 1000 * 60 * 10)
|
|
|
public void reCreateArchiveAuto() {
|