Browse Source

档案案卷自动识别

cr 6 days ago
parent
commit
b6e2fb90b1

+ 82 - 106
blade-service/blade-archive/src/main/java/org/springblade/archive/service/impl/ArchivesAutoServiceImpl.java

@@ -5378,11 +5378,12 @@ public class ArchivesAutoServiceImpl extends BaseServiceImpl<ArchivesAutoMapper,
 				.sum();
 	}
 
+
 	/**
 	 * Runs text recognition for the given archive records and copies the recognized fields
 	 * (file number, unit, start/end date, storage period, secret level, name) onto each record,
 	 * then saves all records with one batch update.
 	 *
 	 * NOTE(review): this listing is a diff; lines between the hunks are elided, so how
 	 * {@code filePath} and {@code b} are produced and how {@code updateSql} is executed is not visible here.
 	 * NOTE(review): Spring documents @Async for void/Future return types — confirm that no caller
 	 * relies on the Boolean returned by this method.
 	 * NOTE(review): the records are loaded before the colour_status updates below, so unless
 	 * colourStatus is refreshed in the elided lines, the final batch update may write a stale value back — verify.
 	 *
 	 * @param idsList ids of the ArchivesAuto rows to process
 	 * @return true after the loop and the final batch update have run
 	 * @throws Exception declared by the signature; the visible per-record catch prints the stack trace and does not rethrow
 	 */
 	@Override
 	@Async
 	public Boolean atuoOCR(List<Long> idsList) throws Exception {
-		//String url="/mnt/sdc/AutoPdf/";
-		String url="D:\\AutoPdf\\";
+		String url="/mnt/sdc/AutoPdf/"; // hard-coded Linux path; its use is in lines not shown in this diff
+		//String url="D:\\AutoPdf\\";
 		List<ArchivesAuto> archivesAutoList = this.list(new LambdaQueryWrapper<ArchivesAuto>().in(ArchivesAuto::getId, idsList));
 		this.update(Wrappers.<ArchivesAuto>lambdaUpdate().set(ArchivesAuto::getColourStatus, 2).in(ArchivesAuto::getId, idsList)); // colourStatus = 2 for every requested id (presumably "in progress" — confirm)
 		for (ArchivesAuto auto : archivesAutoList) {
@@ -5403,46 +5404,61 @@ public class ArchivesAutoServiceImpl extends BaseServiceImpl<ArchivesAutoMapper,
 				if(b){
 					System.out.println("开始识别:"+filePath);
 					List<String> list = extractTextFromPDF(filePath);
-					System.out.println("识别完成:"+list);
 					if(!list.isEmpty()){
 						StringBuilder fileName=new StringBuilder(); // collects every line that matches none of the labels below
 						for (String result : list) {
+							System.out.println("识别结果:"+result);
+							result=result.replaceAll("\\s+", ""); // drop all whitespace from the recognized line
 							if(result.contains("档号")){ // labelled line -> file number
 								String fileNum=result.replace("档号","").replace(":","").replace(":",""); // NOTE(review): both replace(":") calls look identical as rendered; one is presumably the full-width colon — verify in the repository
 								auto.setFileNumber(fileNum);
 							}else if(result.contains("立卷单位")){ // labelled line -> unit
-								String unit=result.replace("立卷单位","").replace(":","").replace(":","");
+								String unit=result.replace("立卷单位","").replace(":","").replace(":","").replaceAll("_","").replace("密级","").replace("级密",""); // also strips underscores and the 密级/级密 markers
 								auto.setUnit(unit);
-							} else if (result.contains("起止日期")) {
-								String time=result.replace("起止日期","").replace(":","").replace(":","");
+							}else if (result.contains("起止日期")) { // labelled line -> start/end date
+								String time=result.replace("起止日期","").replace(":","").replace(":","").replaceAll("_",""); // NOTE(review): `time` is not read by the added lines below
 								if(result.contains("~")){
-									String[] split = time.split("~");
-									auto.setStartDate(split[0]);
-									auto.setEndDate(split[1]);
+									String[] localDateTimes = result.split("~"); // NOTE(review): splits the raw `result`, not the cleaned `time` (the removed line used `time`), so the label text stays in element 0 — confirm intended
+									auto.setStartDate(localDateTimes[0]);
+									auto.setEndDate(localDateTimes[1]); // index 1 is missing when nothing follows the separator
 								} else if (result.contains("-")) {
-									String[] split = time.split("-");
-									auto.setStartDate(split[0]);
-									auto.setEndDate(split[1]);
+									String[] localDateTimes = result.split("-"); // NOTE(review): same `result` vs `time` question as above
+									auto.setStartDate(localDateTimes[0]);
+									auto.setEndDate(localDateTimes[1]);
 								} else if (result.contains("~")) { // NOTE(review): same test as the first branch as rendered here, so unreachable; presumably a different tilde character in the repository — verify
-									String[] split = time.split("~");
-									auto.setStartDate(split[0]);
-									auto.setEndDate(split[1]);
+									String[] localDateTimes = result.split("~");
+									auto.setStartDate(localDateTimes[0]);
+									auto.setEndDate(localDateTimes[1]);
 								}
 							} else if (result.contains("保管期限")||result.contains("保管限期")) { // labelled line -> storage period; both character orders are accepted
-								String storageTime=result.replace("保管期限","").replace("保管限期","").replace(":","").replace(":","");
-								auto.setStorageTime(storageTime);
+								String storageTime=result.replace("保管期限","").replace("保管限期","").replace(":","").replace(":","").replaceAll("_","");
+								if(StringUtils.isNotEmpty(storageTime)){ // stored as a code: "10年" -> "1", "30年" -> "2", any other non-empty text -> "3"
+									if("10年".equals(storageTime)){
+										auto.setStorageTime("1");
+									} else if("30年".equals(storageTime)){
+										auto.setStorageTime("2");
+									} else{
+										auto.setStorageTime("3");
+									}
+								}
 							} else if (result.contains("密1")||result.contains("密级")) { // labelled line -> secret level
-								String secretLevel=result.replace("密1","").replace("密级","");
+								String secretLevel=result.replace("密1","").replace("密级","").replaceAll("_","");
 								auto.setSecretLevel(secretLevel);
 							} else { // any other line is appended to the name text
-								fileName.append(result);
+								if(StringUtils.isNotEmpty(result)){
+									fileName.append(result);
+									System.out.println("文件题名识别结果:"+fileName);
+								}
 							}
 						}
-						auto.setName(fileName.toString());
+						System.out.println("文件题名最终结果:===================="+fileName);
+						if(StringUtils.isNotEmpty(fileName.toString())){ // the name is only overwritten when some text was collected
+							auto.setName(fileName.toString().replace("密级","").replace("级密",""));
+						}
 					}
 				}
 			}catch (Exception e){
-
+				e.printStackTrace(); // the failure is printed and not rethrown
 			}finally {
 				FileUtils.removeFile(filePath); // runs for every record, whether recognition succeeded or failed
 				String updateSql="update u_archives_auto set colour_status=1 where id="+auto.getId(); // id is concatenated into the SQL text; the statement's execution is in lines not shown
@@ -5452,105 +5468,65 @@ public class ArchivesAutoServiceImpl extends BaseServiceImpl<ArchivesAutoMapper,
 		this.updateBatchById(archivesAutoList); // saves the fields set on the records in the loop above
 		return true;
 	}
-
-	public static LocalDateTime[] convertDateRange(String dateRange,String split) { // parses "<start><separator><end>" into {start, end}, each at 00:00:00; every line of this method is removed in this diff
-		String[] dates = dateRange.split(split); // String.split treats `split` as a regular expression
-
-		DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyyMMdd"); // both parts must be in yyyyMMdd form
-
-		LocalDate startLocalDate = LocalDate.parse(dates[0], formatter);
-		LocalDate endLocalDate = LocalDate.parse(dates[1], formatter); // needs at least two parts after the split
-
-		LocalDateTime startDateTime = startLocalDate.atStartOfDay(); // 00:00:00
-		LocalDateTime endDateTime = endLocalDate.atStartOfDay();; // 00:00:00
-
-		return new LocalDateTime[]{startDateTime, endDateTime};
-	}
-	// Custom entity class (matches the JSON structure)
-	static class OcrResponse { // Gson target used by the removed version of extractTextFromPDF; every line of this class is removed in this diff
-		private String status;
-		private String message;
-		private List<String> lines;
-
-		public String getStatus() { return status; }
-		public void setStatus(String status) { this.status = status; }
-		public String getMessage() { return message; }
-		public void setMessage(String message) { this.message = message; }
-		public List<String> getLines() { return lines; }
-		public void setLines(List<String> lines) { this.lines = lines; }
-	}
-
 	/**
 	 * Runs the Python text-extraction script on a PDF and returns the text lines it reports.
 	 * This description covers the post-change code (the context and "+" lines of this diff):
 	 * the process output is read to the end, the span from the first '{' to the last '}' is
 	 * parsed as JSON, and the "lines" entry is returned when "status" equals "success".
 	 * The numbered System.out.println calls are progress traces.
 	 *
 	 * @param pdfFilePath path handed to the script as its only argument
 	 * @return the "lines" entry of the script's JSON converted to {@code List<String>};
 	 *         NOTE(review): probably null when the JSON has no "lines" entry, and the visible
 	 *         caller invokes isEmpty() on the result — confirm
 	 * @throws IOException if the process cannot be started or its output cannot be read
 	 * @throws InterruptedException if the thread is interrupted while waiting for the process
 	 * @throws RuntimeException on a non-zero exit code, when no '{'...'}' span is found, or when status is not "success"
 	 */
 	public List<String> extractTextFromPDF(String pdfFilePath) throws IOException, InterruptedException {
-		// 1. Configure the paths (make sure they are correct)
-		String PYTHON_SCRIPT_PATH = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\Python\\pdf.py";
-		String PYTHON_INTERPRETER = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\python.exe";
+		//String PYTHON_SCRIPT_PATH = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\Python\\pdfTextExtractorWindows.py";
+		//String PYTHON_INTERPRETER = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\python.exe";
+		System.out.println("进入识别1");
+		String PYTHON_SCRIPT_PATH = "/www/wwwlogs/python/pdfTextExtractorWindows.py"; // hard-coded script location
+		String PYTHON_INTERPRETER = "python3"; // interpreter given by name only, without a directory
+		String[] command = {
+				PYTHON_INTERPRETER,
+				PYTHON_SCRIPT_PATH,
+				pdfFilePath
+		};
 
-		// 2. Build the command
-		String[] command = {PYTHON_INTERPRETER, PYTHON_SCRIPT_PATH, pdfFilePath};
-
-		// 3. Run the process (separate output streams)
 		Process process = new ProcessBuilder(command)
-				.redirectErrorStream(false)
+				.redirectErrorStream(true) // stderr is merged into the stream read below; NOTE(review): braces in error text would then take part in the '{'...'}' extraction
 				.start();
+		System.out.println("进入识别2");
+		// Read the Python output
+		StringBuilder output = new StringBuilder();
+		try (InputStream inputStream = process.getInputStream();
+			 BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))) {
 
-		// 4. Read the JSON result (stdout)
-		StringBuilder jsonOutput = new StringBuilder();
-		try (BufferedReader reader = new BufferedReader(
-				new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8))) {
 			String line;
 			while ((line = reader.readLine()) != null) {
-				jsonOutput.append(line.trim()); // strip extra spaces/newlines
+				output.append(line); // lines are concatenated without any separator
 			}
 		}
-
-		// 5. Read the error log (stderr, for debugging)
-		StringBuilder errorOutput = new StringBuilder();
-		try (BufferedReader errorReader = new BufferedReader(
-				new InputStreamReader(process.getErrorStream(), StandardCharsets.UTF_8))) {
-			String line;
-			while ((line = errorReader.readLine()) != null) {
-				errorOutput.append(line).append("\n");
-			}
-		}
-
-		// 6. Check the process exit code
+		System.out.println("进入识别3");
 		int exitCode = process.waitFor();
 		if (exitCode != 0) {
-			throw new RuntimeException(
-					"Python脚本执行失败(退出码:" + exitCode + ")\n" +
-							"错误日志:" + errorOutput.toString() + "\n" +
-							"标准输出:" + jsonOutput.toString()
-			);
-		}
-
-		// 7. Parse the JSON (core fix: parse directly into the entity class)
-		String jsonStr = jsonOutput.toString();
-		if (jsonStr.isEmpty()) {
-			throw new RuntimeException("Python脚本未输出任何结果(JSON 为空)");
-		}
-
-		// Extract the pure JSON string (filter out possible leading log output)
-		int jsonStart = jsonStr.indexOf('{');
-		int jsonEnd = jsonStr.lastIndexOf('}');
-		if (jsonStart == -1 || jsonEnd == -1) {
-			throw new RuntimeException(
-					"无法提取JSON结果,原始输出:" + jsonStr + "\n" +
-							"Python错误日志:" + errorOutput.toString()
-			);
-		}
-		jsonStr = jsonStr.substring(jsonStart, jsonEnd + 1);
-
-		// Parse directly into the entity class to avoid Map conversion problems
+			System.out.println("进入识别4");
+			throw new RuntimeException("Python脚本执行失败,退出码: " + exitCode + ", 输出: " + output.toString());
+		}
+
+		// -------------------------- Key change: extract the pure JSON part --------------------------
+		String rawOutput = output.toString();
+		// Find where the JSON starts (first '{') and where it ends (last '}')
+		int jsonStart = rawOutput.indexOf('{');
+		int jsonEnd = rawOutput.lastIndexOf('}');
+		System.out.println("进入识别5");
+		if (jsonStart == -1 || jsonEnd == -1 || jsonStart >= jsonEnd) {
+			System.out.println("进入识别6");
+			throw new RuntimeException("无法提取有效的JSON结果,原始输出: " + rawOutput);
+		}
+		// Cut out the pure JSON string
+		String jsonStr = rawOutput.substring(jsonStart, jsonEnd + 1);
+		System.out.println("进入识别7");
+		// Parse the cleaned JSON
 		Gson gson = new Gson();
-		OcrResponse ocrResponse = gson.fromJson(jsonStr, OcrResponse.class);
+		Type type = new TypeToken<Map<String, Object>>(){}.getType();
+		Map<String, Object> resultMap = gson.fromJson(jsonStr, type);
 
-		// 8. Validate the result status
-		if (!"success".equals(ocrResponse.getStatus())) {
-			throw new RuntimeException("PDF处理失败: " + ocrResponse.getMessage() + "\nPython日志:" + errorOutput.toString());
+		if (!"success".equals(resultMap.get("status"))) {
+			System.out.println("进入识别8");
+			String message = (String) resultMap.get("message"); // NOTE(review): the cast fails if "message" is not a JSON string — confirm the script's output format
+			throw new RuntimeException("处理PDF失败: " + (message != null ? message : "未知错误"));
 		}
-
-		// 9. Return List<String> (taken directly from the entity class)
-		return ocrResponse.getLines();
+		System.out.println("进入识别9");
+		Type listType = new TypeToken<List<String>>(){}.getType();
+		return gson.fromJson(gson.toJson(resultMap.get("lines")), listType); // re-serializes the "lines" value and parses it again as List<String>
 	}
 
 	@Scheduled(fixedDelay = 1000 * 60 * 10)