|
@@ -19,6 +19,9 @@ package org.springblade.archive.service.impl;
|
|
|
import com.alibaba.fastjson.JSON;
|
|
|
import com.alibaba.fastjson.JSONArray;
|
|
|
import com.alibaba.fastjson.JSONObject;
|
|
|
+import com.google.gson.Gson;
|
|
|
+import com.google.gson.reflect.TypeToken;
|
|
|
+import java.lang.reflect.Type;
|
|
|
|
|
|
import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
|
|
|
import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
|
|
@@ -96,7 +99,9 @@ import java.nio.file.Files;
|
|
|
import java.nio.file.Path;
|
|
|
import java.nio.file.Paths;
|
|
|
import java.rmi.ServerException;
|
|
|
+import java.time.LocalDate;
|
|
|
import java.time.LocalDateTime;
|
|
|
+import java.time.format.DateTimeFormatter;
|
|
|
import java.util.*;
|
|
|
import java.util.List;
|
|
|
import java.util.concurrent.ExecutorService;
|
|
@@ -5254,6 +5259,132 @@ public class ArchivesAutoServiceImpl extends BaseServiceImpl<ArchivesAutoMapper,
|
|
|
.mapToInt(file -> file.getFilePage() != null ? file.getFilePage() : 0)
|
|
|
.sum();
|
|
|
}
|
|
|
+
|
|
|
+ @Override
|
|
|
+ @Async
|
|
|
+ public boolean atuoOCR(String ids) throws Exception {
|
|
|
+ //String url="/mnt/sdc/AutoPdf/";
|
|
|
+ String url="D:\\AutoPdf\\";
|
|
|
+ List<Long> idsList = Func.toLongList(ids);
|
|
|
+ List<ArchivesAuto> archivesAutoList = this.list(new LambdaQueryWrapper<ArchivesAuto>().in(ArchivesAuto::getId, idsList));
|
|
|
+ for (ArchivesAuto auto : archivesAutoList) {
|
|
|
+ String fileUrl=auto.getOutUrl().substring(0,auto.getOutUrl().indexOf("@"));
|
|
|
+ String filePath=url+auto.getName()+".pdf";
|
|
|
+ Boolean b = FileUtils.saveInputStreamByUrl(fileUrl, filePath);
|
|
|
+ try {
|
|
|
+ if(b){
|
|
|
+ List<String> list = extractTextFromPDF(filePath);
|
|
|
+ if(!list.isEmpty()){
|
|
|
+ StringBuilder fileName=new StringBuilder();
|
|
|
+ for (String result : list) {
|
|
|
+ if(result.contains("档号")){
|
|
|
+ String fileNum=result.replace("档号","").replace(":","").replace(":","");
|
|
|
+ auto.setFileNumber(fileNum);
|
|
|
+ }else if(result.contains("立卷单位")){
|
|
|
+ String unit=result.replace("立卷单位","").replace(":","").replace(":","");
|
|
|
+ auto.setUnit(unit);
|
|
|
+ } else if (result.contains("起止日期")) {
|
|
|
+ String time=result.replace("起止日期","").replace(":","").replace(":","");
|
|
|
+ if(result.contains("~")){
|
|
|
+ LocalDateTime[] localDateTimes = convertDateRange(time, "~");
|
|
|
+ auto.setStartDate(localDateTimes[0]);
|
|
|
+ auto.setEndDate(localDateTimes[1]);
|
|
|
+ } else if (result.contains("-")) {
|
|
|
+ LocalDateTime[] localDateTimes = convertDateRange(time, "-");
|
|
|
+ auto.setStartDate(localDateTimes[0]);
|
|
|
+ auto.setEndDate(localDateTimes[1]);
|
|
|
+ }
|
|
|
+ } else if (result.contains("保管期限")||result.contains("保管限期")) {
|
|
|
+ String storageTime=result.replace("保管期限","").replace("保管限期","").replace(":","").replace(":","");
|
|
|
+ auto.setStorageTime(storageTime);
|
|
|
+ } else if (result.contains("密1")||result.contains("密级")) {
|
|
|
+ String secretLevel=result.replace("密1","").replace("密级","");
|
|
|
+ auto.setSecretLevel(secretLevel);
|
|
|
+ } else {
|
|
|
+ fileName.append(result);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ auto.setName(fileName.toString());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }catch (Exception e){
|
|
|
+
|
|
|
+ }finally {
|
|
|
+ FileUtils.removeFile(filePath);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ this.updateBatchById(archivesAutoList);
|
|
|
+ return true;
|
|
|
+ }
|
|
|
+
|
|
|
+ public static LocalDateTime[] convertDateRange(String dateRange,String split) {
|
|
|
+ String[] dates = dateRange.split(split);
|
|
|
+
|
|
|
+ DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyyMMdd");
|
|
|
+
|
|
|
+ LocalDate startLocalDate = LocalDate.parse(dates[0], formatter);
|
|
|
+ LocalDate endLocalDate = LocalDate.parse(dates[1], formatter);
|
|
|
+
|
|
|
+ LocalDateTime startDateTime = startLocalDate.atStartOfDay(); // 00:00:00
|
|
|
+ LocalDateTime endDateTime = endLocalDate.atStartOfDay();; // 00:00:00
|
|
|
+
|
|
|
+ return new LocalDateTime[]{startDateTime, endDateTime};
|
|
|
+ }
|
|
|
+
|
|
|
+ public List<String> extractTextFromPDF(String pdfFilePath) throws IOException, InterruptedException {
|
|
|
+ String PYTHON_SCRIPT_PATH = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\Python\\pdfTextExtractorWindows.py";
|
|
|
+ String PYTHON_INTERPRETER = "C:\\Users\\hc01\\AppData\\Local\\Programs\\Python\\Python310\\python.exe";
|
|
|
+
|
|
|
+ String[] command = {
|
|
|
+ PYTHON_INTERPRETER,
|
|
|
+ PYTHON_SCRIPT_PATH,
|
|
|
+ pdfFilePath
|
|
|
+ };
|
|
|
+
|
|
|
+ Process process = new ProcessBuilder(command)
|
|
|
+ .redirectErrorStream(true)
|
|
|
+ .start();
|
|
|
+
|
|
|
+ // 读取Python输出
|
|
|
+ StringBuilder output = new StringBuilder();
|
|
|
+ try (InputStream inputStream = process.getInputStream();
|
|
|
+ BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))) {
|
|
|
+
|
|
|
+ String line;
|
|
|
+ while ((line = reader.readLine()) != null) {
|
|
|
+ output.append(line);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ int exitCode = process.waitFor();
|
|
|
+ if (exitCode != 0) {
|
|
|
+ throw new RuntimeException("Python脚本执行失败,退出码: " + exitCode + ", 输出: " + output.toString());
|
|
|
+ }
|
|
|
+
|
|
|
+ // -------------------------- 关键修改:提取纯JSON部分 --------------------------
|
|
|
+ String rawOutput = output.toString();
|
|
|
+ // 找到JSON的起始位置(第一个'{')和结束位置(最后一个'}')
|
|
|
+ int jsonStart = rawOutput.indexOf('{');
|
|
|
+ int jsonEnd = rawOutput.lastIndexOf('}');
|
|
|
+ if (jsonStart == -1 || jsonEnd == -1 || jsonStart >= jsonEnd) {
|
|
|
+ throw new RuntimeException("无法提取有效的JSON结果,原始输出: " + rawOutput);
|
|
|
+ }
|
|
|
+ // 截取纯JSON字符串
|
|
|
+ String jsonStr = rawOutput.substring(jsonStart, jsonEnd + 1);
|
|
|
+
|
|
|
+ // 解析清理后的JSON
|
|
|
+ Gson gson = new Gson();
|
|
|
+ Type type = new TypeToken<Map<String, Object>>(){}.getType();
|
|
|
+ Map<String, Object> resultMap = gson.fromJson(jsonStr, type);
|
|
|
+
|
|
|
+ if (!"success".equals(resultMap.get("status"))) {
|
|
|
+ String message = (String) resultMap.get("message");
|
|
|
+ throw new RuntimeException("处理PDF失败: " + (message != null ? message : "未知错误"));
|
|
|
+ }
|
|
|
+
|
|
|
+ Type listType = new TypeToken<List<String>>(){}.getType();
|
|
|
+ return gson.fromJson(gson.toJson(resultMap.get("lines")), listType);
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
|