|
|
@@ -5378,11 +5378,12 @@ public class ArchivesAutoServiceImpl extends BaseServiceImpl<ArchivesAutoMapper,
|
|
|
.sum();
|
|
|
}
|
|
|
|
|
|
+
|
|
|
@Override
|
|
|
@Async
|
|
|
public Boolean atuoOCR(List<Long> idsList) throws Exception {
|
|
|
- //String url="/mnt/sdc/AutoPdf/";
|
|
|
- String url="D:\\AutoPdf\\";
|
|
|
+ String url="/mnt/sdc/AutoPdf/";
|
|
|
+ //String url="D:\\AutoPdf\\";
|
|
|
List<ArchivesAuto> archivesAutoList = this.list(new LambdaQueryWrapper<ArchivesAuto>().in(ArchivesAuto::getId, idsList));
|
|
|
this.update(Wrappers.<ArchivesAuto>lambdaUpdate().set(ArchivesAuto::getColourStatus, 2).in(ArchivesAuto::getId, idsList));
|
|
|
for (ArchivesAuto auto : archivesAutoList) {
|
|
|
@@ -5403,46 +5404,61 @@ public class ArchivesAutoServiceImpl extends BaseServiceImpl<ArchivesAutoMapper,
|
|
|
if(b){
|
|
|
System.out.println("开始识别:"+filePath);
|
|
|
List<String> list = extractTextFromPDF(filePath);
|
|
|
- System.out.println("识别完成:"+list);
|
|
|
if(!list.isEmpty()){
|
|
|
StringBuilder fileName=new StringBuilder();
|
|
|
for (String result : list) {
|
|
|
+ System.out.println("识别结果:"+result);
|
|
|
+ result=result.replaceAll("\\s+", "");
|
|
|
if(result.contains("档号")){
|
|
|
String fileNum=result.replace("档号","").replace(":","").replace(":","");
|
|
|
auto.setFileNumber(fileNum);
|
|
|
}else if(result.contains("立卷单位")){
|
|
|
- String unit=result.replace("立卷单位","").replace(":","").replace(":","");
|
|
|
+ String unit=result.replace("立卷单位","").replace(":","").replace(":","").replaceAll("_","").replace("密级","").replace("级密","");
|
|
|
auto.setUnit(unit);
|
|
|
- } else if (result.contains("起止日期")) {
|
|
|
- String time=result.replace("起止日期","").replace(":","").replace(":","");
|
|
|
+ }else if (result.contains("起止日期")) {
|
|
|
+ String time=result.replace("起止日期","").replace(":","").replace(":","").replaceAll("_","");
|
|
|
if(result.contains("~")){
|
|
|
- String[] split = time.split("~");
|
|
|
- auto.setStartDate(split[0]);
|
|
|
- auto.setEndDate(split[1]);
|
|
|
+ String[] localDateTimes = result.split("~");
|
|
|
+ auto.setStartDate(localDateTimes[0]);
|
|
|
+ auto.setEndDate(localDateTimes[1]);
|
|
|
} else if (result.contains("-")) {
|
|
|
- String[] split = time.split("-");
|
|
|
- auto.setStartDate(split[0]);
|
|
|
- auto.setEndDate(split[1]);
|
|
|
+ String[] localDateTimes = result.split("-");
|
|
|
+ auto.setStartDate(localDateTimes[0]);
|
|
|
+ auto.setEndDate(localDateTimes[1]);
|
|
|
} else if (result.contains("~")) {
|
|
|
- String[] split = time.split("~");
|
|
|
- auto.setStartDate(split[0]);
|
|
|
- auto.setEndDate(split[1]);
|
|
|
+ String[] localDateTimes = result.split("~");
|
|
|
+ auto.setStartDate(localDateTimes[0]);
|
|
|
+ auto.setEndDate(localDateTimes[1]);
|
|
|
}
|
|
|
} else if (result.contains("保管期限")||result.contains("保管限期")) {
|
|
|
- String storageTime=result.replace("保管期限","").replace("保管限期","").replace(":","").replace(":","");
|
|
|
- auto.setStorageTime(storageTime);
|
|
|
+ String storageTime=result.replace("保管期限","").replace("保管限期","").replace(":","").replace(":","").replaceAll("_","");
|
|
|
+ if(StringUtils.isNotEmpty(storageTime)){
|
|
|
+ if("10年".equals(storageTime)){
|
|
|
+ auto.setStorageTime("1");
|
|
|
+ } else if("30年".equals(storageTime)){
|
|
|
+ auto.setStorageTime("2");
|
|
|
+ } else{
|
|
|
+ auto.setStorageTime("3");
|
|
|
+ }
|
|
|
+ }
|
|
|
} else if (result.contains("密1")||result.contains("密级")) {
|
|
|
- String secretLevel=result.replace("密1","").replace("密级","");
|
|
|
+ String secretLevel=result.replace("密1","").replace("密级","").replaceAll("_","");
|
|
|
auto.setSecretLevel(secretLevel);
|
|
|
} else {
|
|
|
- fileName.append(result);
|
|
|
+ if(StringUtils.isNotEmpty(result)){
|
|
|
+ fileName.append(result);
|
|
|
+ System.out.println("文件题名识别结果:"+fileName);
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
- auto.setName(fileName.toString());
|
|
|
+ System.out.println("文件题名最终结果:===================="+fileName);
|
|
|
+ if(StringUtils.isNotEmpty(fileName.toString())){
|
|
|
+ auto.setName(fileName.toString().replace("密级","").replace("级密",""));
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
}catch (Exception e){
|
|
|
-
|
|
|
+ e.printStackTrace();
|
|
|
}finally {
|
|
|
FileUtils.removeFile(filePath);
|
|
|
String updateSql="update u_archives_auto set colour_status=1 where id="+auto.getId();
|
|
|
@@ -5452,105 +5468,65 @@ public class ArchivesAutoServiceImpl extends BaseServiceImpl<ArchivesAutoMapper,
|
|
|
this.updateBatchById(archivesAutoList);
|
|
|
return true;
|
|
|
}
|
|
|
-
|
|
|
- public static LocalDateTime[] convertDateRange(String dateRange,String split) {
|
|
|
- String[] dates = dateRange.split(split);
|
|
|
-
|
|
|
- DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyyMMdd");
|
|
|
-
|
|
|
- LocalDate startLocalDate = LocalDate.parse(dates[0], formatter);
|
|
|
- LocalDate endLocalDate = LocalDate.parse(dates[1], formatter);
|
|
|
-
|
|
|
- LocalDateTime startDateTime = startLocalDate.atStartOfDay(); // 00:00:00
|
|
|
- LocalDateTime endDateTime = endLocalDate.atStartOfDay();; // 00:00:00
|
|
|
-
|
|
|
- return new LocalDateTime[]{startDateTime, endDateTime};
|
|
|
- }
|
|
|
- // 自定义实体类(与 JSON 结构匹配)
|
|
|
- static class OcrResponse {
|
|
|
- private String status;
|
|
|
- private String message;
|
|
|
- private List<String> lines;
|
|
|
-
|
|
|
- public String getStatus() { return status; }
|
|
|
- public void setStatus(String status) { this.status = status; }
|
|
|
- public String getMessage() { return message; }
|
|
|
- public void setMessage(String message) { this.message = message; }
|
|
|
- public List<String> getLines() { return lines; }
|
|
|
- public void setLines(List<String> lines) { this.lines = lines; }
|
|
|
- }
|
|
|
-
|
|
|
public List<String> extractTextFromPDF(String pdfFilePath) throws IOException, InterruptedException {
|
|
|
- // 1. 配置路径(确保正确)
|
|
|
- String PYTHON_SCRIPT_PATH = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\Python\\pdf.py";
|
|
|
- String PYTHON_INTERPRETER = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\python.exe";
|
|
|
+ //String PYTHON_SCRIPT_PATH = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\Python\\pdfTextExtractorWindows.py";
|
|
|
+ //String PYTHON_INTERPRETER = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\python.exe";
|
|
|
+ System.out.println("进入识别1");
|
|
|
+ String PYTHON_SCRIPT_PATH = "/www/wwwlogs/python/pdfTextExtractorWindows.py";
|
|
|
+ String PYTHON_INTERPRETER = "python3";
|
|
|
+ String[] command = {
|
|
|
+ PYTHON_INTERPRETER,
|
|
|
+ PYTHON_SCRIPT_PATH,
|
|
|
+ pdfFilePath
|
|
|
+ };
|
|
|
|
|
|
- // 2. 构建命令
|
|
|
- String[] command = {PYTHON_INTERPRETER, PYTHON_SCRIPT_PATH, pdfFilePath};
|
|
|
-
|
|
|
- // 3. 执行进程(分离输出流)
|
|
|
Process process = new ProcessBuilder(command)
|
|
|
- .redirectErrorStream(false)
|
|
|
+ .redirectErrorStream(true)
|
|
|
.start();
|
|
|
+ System.out.println("进入识别2");
|
|
|
+ // 读取Python输出
|
|
|
+ StringBuilder output = new StringBuilder();
|
|
|
+ try (InputStream inputStream = process.getInputStream();
|
|
|
+ BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))) {
|
|
|
|
|
|
- // 4. 读取 JSON 结果(stdout)
|
|
|
- StringBuilder jsonOutput = new StringBuilder();
|
|
|
- try (BufferedReader reader = new BufferedReader(
|
|
|
- new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
|
|
|
String line;
|
|
|
while ((line = reader.readLine()) != null) {
|
|
|
- jsonOutput.append(line.trim()); // 去除多余空格/换行
|
|
|
+ output.append(line);
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
- // 5. 读取错误日志(stderr,用于调试)
|
|
|
- StringBuilder errorOutput = new StringBuilder();
|
|
|
- try (BufferedReader errorReader = new BufferedReader(
|
|
|
- new InputStreamReader(process.getErrorStream(), StandardCharsets.UTF_8))) {
|
|
|
- String line;
|
|
|
- while ((line = errorReader.readLine()) != null) {
|
|
|
- errorOutput.append(line).append("\n");
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- // 6. 检查进程退出码
|
|
|
+ System.out.println("进入识别3");
|
|
|
int exitCode = process.waitFor();
|
|
|
if (exitCode != 0) {
|
|
|
- throw new RuntimeException(
|
|
|
- "Python脚本执行失败(退出码:" + exitCode + ")\n" +
|
|
|
- "错误日志:" + errorOutput.toString() + "\n" +
|
|
|
- "标准输出:" + jsonOutput.toString()
|
|
|
- );
|
|
|
- }
|
|
|
-
|
|
|
- // 7. 解析 JSON(核心修复:用实体类直接解析)
|
|
|
- String jsonStr = jsonOutput.toString();
|
|
|
- if (jsonStr.isEmpty()) {
|
|
|
- throw new RuntimeException("Python脚本未输出任何结果(JSON 为空)");
|
|
|
- }
|
|
|
-
|
|
|
- // 提取纯 JSON 字符串(过滤可能的前缀日志)
|
|
|
- int jsonStart = jsonStr.indexOf('{');
|
|
|
- int jsonEnd = jsonStr.lastIndexOf('}');
|
|
|
- if (jsonStart == -1 || jsonEnd == -1) {
|
|
|
- throw new RuntimeException(
|
|
|
- "无法提取JSON结果,原始输出:" + jsonStr + "\n" +
|
|
|
- "Python错误日志:" + errorOutput.toString()
|
|
|
- );
|
|
|
- }
|
|
|
- jsonStr = jsonStr.substring(jsonStart, jsonEnd + 1);
|
|
|
-
|
|
|
- // 直接解析为实体类,避免 Map 转换问题
|
|
|
+ System.out.println("进入识别4");
|
|
|
+ throw new RuntimeException("Python脚本执行失败,退出码: " + exitCode + ", 输出: " + output.toString());
|
|
|
+ }
|
|
|
+
|
|
|
+ // -------------------------- 关键修改:提取纯JSON部分 --------------------------
|
|
|
+ String rawOutput = output.toString();
|
|
|
+ // 找到JSON的起始位置(第一个'{')和结束位置(最后一个'}')
|
|
|
+ int jsonStart = rawOutput.indexOf('{');
|
|
|
+ int jsonEnd = rawOutput.lastIndexOf('}');
|
|
|
+ System.out.println("进入识别5");
|
|
|
+ if (jsonStart == -1 || jsonEnd == -1 || jsonStart >= jsonEnd) {
|
|
|
+ System.out.println("进入识别6");
|
|
|
+ throw new RuntimeException("无法提取有效的JSON结果,原始输出: " + rawOutput);
|
|
|
+ }
|
|
|
+ // 截取纯JSON字符串
|
|
|
+ String jsonStr = rawOutput.substring(jsonStart, jsonEnd + 1);
|
|
|
+ System.out.println("进入识别7");
|
|
|
+ // 解析清理后的JSON
|
|
|
Gson gson = new Gson();
|
|
|
- OcrResponse ocrResponse = gson.fromJson(jsonStr, OcrResponse.class);
|
|
|
+ Type type = new TypeToken<Map<String, Object>>(){}.getType();
|
|
|
+ Map<String, Object> resultMap = gson.fromJson(jsonStr, type);
|
|
|
|
|
|
- // 8. 校验结果状态
|
|
|
- if (!"success".equals(ocrResponse.getStatus())) {
|
|
|
- throw new RuntimeException("PDF处理失败: " + ocrResponse.getMessage() + "\nPython日志:" + errorOutput.toString());
|
|
|
+ if (!"success".equals(resultMap.get("status"))) {
|
|
|
+ System.out.println("进入识别8");
|
|
|
+ String message = (String) resultMap.get("message");
|
|
|
+ throw new RuntimeException("处理PDF失败: " + (message != null ? message : "未知错误"));
|
|
|
}
|
|
|
-
|
|
|
- // 9. 返回 List<String>(直接从实体类获取)
|
|
|
- return ocrResponse.getLines();
|
|
|
+ System.out.println("进入识别9");
|
|
|
+ Type listType = new TypeToken<List<String>>(){}.getType();
|
|
|
+ return gson.fromJson(gson.toJson(resultMap.get("lines")), listType);
|
|
|
}
|
|
|
|
|
|
@Scheduled(fixedDelay = 1000 * 60 * 10)
|