Commit 16ac9131 authored by Dejiao Zeng's avatar Dejiao Zeng
Browse files

更新

parent 0443cdd7
# 记录一下脑仁疼的ocr.py环境安装方法
## Ubuntu 16.04 下先安装以下依赖库
```
apt-get update
apt-get install lzma liblzma-dev libbz2-dev libssl-dev libffi-dev
sudo apt-get install curl
sudo add-apt-repository ppa:ubuntu-toolchain-r/test
sudo apt-get update
sudo apt-get install gcc-4.9
sudo apt-get upgrade libstdc++6
```
## Python源码安装
```
先修改/home/mia/Downloads/Python-3.8.16/Modules/Setup启用ssl
SSL=/usr
_ssl _ssl.c \
-DUSE_SSL -I$(SSL)/include -I$(SSL)/include/openssl \
-L$(SSL)/lib -lssl -lcrypto
./configure --enable-optimizations
make
sudo make install
```
## 创建环境
```
/usr/local/bin/python3 -m venv ocr
```
## 安装百度飞桨
```
python -m pip install paddlehub -i https://pypi.tuna.tsinghua.edu.cn/simple
python -m pip install paddlepaddle -i https://pypi.tuna.tsinghua.edu.cn/simple
```
## pom.xml中的cermine-impl可以在maven下载
## pom.xml中的common依赖如果不需要，可以直接删除
\ No newline at end of file
......@@ -8,6 +8,7 @@ import com.csu.common.execption.BadRequestException;
import com.csu.fileserver.cermine.ArticleMeta;
import com.csu.fileserver.pdfextractor.CermineExtractorService;
import com.csu.fileserver.pdfextractor.ExtractionResult;
import com.csu.fileserver.utils.FileDownloader;
import com.csu.fileserver.vo.DynamicFormVO;
import com.csu.fileserver.vo.ExtractParam;
import org.apache.commons.io.FileUtils;
......@@ -18,6 +19,7 @@ import org.apache.pdfbox.text.PDFTextStripper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.InputStreamResource;
import org.springframework.core.io.Resource;
import org.springframework.core.io.UrlResource;
......@@ -28,11 +30,13 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
......@@ -42,10 +46,12 @@ import java.util.regex.Pattern;
public class ExtractorController {
private Logger log = LoggerFactory.getLogger(this.getClass());
private final CermineExtractorService cermineExtractorService;
private final String tempfile;
@Autowired
public ExtractorController(CermineExtractorService cermineExtractorService) {
public ExtractorController(CermineExtractorService cermineExtractorService, @Value("${file-server.tempfile}") String tempfile) {
this.cermineExtractorService = cermineExtractorService;
this.tempfile = tempfile;
}
List<String> matchFirstPage(PDDocument doc, List<String> target) throws IOException{
......@@ -105,6 +111,11 @@ public class ExtractorController {
String type = param.getType();
String wjdz = param.getWjdz();
List<String> matched = Collections.emptyList();//singletonList("61672536")
if(wjdz.startsWith("http")){
String tempfilewithuuid = tempfile + "/" + UUID.randomUUID() + ".pdf";
FileDownloader.downloadFile(wjdz,tempfilewithuuid);
wjdz = tempfilewithuuid;
}
log.info("parse start");
if ("CG_QK".equals(type)){
// FileInputStream ins = new FileInputStream(wjdz);
......@@ -123,6 +134,7 @@ public class ExtractorController {
} catch (IOException ioe) {
ioe.printStackTrace();
}
Files.delete(Paths.get(wjdz));
return ResultHandler.ok(res);
}else if ("CG_HY".equals(type)){
// FileInputStream ins = new FileInputStream(wjdz);
......@@ -141,13 +153,14 @@ public class ExtractorController {
} catch (IOException ioe) {
ioe.printStackTrace();
}
Files.delete(Paths.get(wjdz));
return ResultHandler.ok(res);
}else if ("CG_ZZQ".equals(type) || "CG_ZL".equals(type)){
Path temp = Paths.get(wjdz);
if ("CG_ZZQ".equals(type)) type = "software";
else type = "patent";
System.out.println(temp.toAbsolutePath().toString());
Process p = Runtime.getRuntime().exec(new String[]{"python",".\\script\\ocr.py",temp.toAbsolutePath().toString(),type});
Process p = Runtime.getRuntime().exec(new String[]{"python","./script/ocr.py",temp.toAbsolutePath().toString(),type});
p.waitFor();
System.out.println(IOUtils.toString(p.getInputStream(), StandardCharsets.UTF_8));
System.err.println(IOUtils.toString(p.getErrorStream(), StandardCharsets.UTF_8));
......@@ -161,9 +174,11 @@ public class ExtractorController {
e.printStackTrace();
}
result.delete();
Files.delete(Paths.get(wjdz));
return ResultHandler.ok(json);
}
log.info("parse end");
Files.delete(Paths.get(wjdz));
return ResultHandler.ok(Collections.emptyList());
}
......
package com.csu.fileserver.utils;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
/**
 * Utility for copying the content behind a URL into a local file.
 */
public class FileDownloader {
    /**
     * Reads everything available from {@code fileUrl} and writes it to
     * {@code saveFilePath}, creating the file or overwriting an existing one.
     *
     * <p>Both the source stream and the destination file are closed when this
     * method returns, whether it completes normally or throws.
     *
     * @param fileUrl      the URL to read from, in any form accepted by {@link URL}
     * @param saveFilePath path of the local file to write
     * @throws IOException if the URL is malformed, the connection cannot be
     *                     opened, the destination cannot be created, or a
     *                     read/write fails part-way through
     */
    public static void downloadFile(String fileUrl, String saveFilePath) throws IOException {
        URL url = new URL(fileUrl);
        URLConnection connection = url.openConnection();
        // try-with-resources guarantees both streams are released even when the
        // transfer fails; previously an exception leaked the connection and file handle.
        try (InputStream inputStream = connection.getInputStream();
             FileOutputStream outputStream = new FileOutputStream(saveFilePath)) {
            byte[] buffer = new byte[4096];
            int bytesRead;
            while ((bytesRead = inputStream.read(buffer)) != -1) {
                outputStream.write(buffer, 0, bytesRead);
            }
        }
    }
}
......@@ -32,6 +32,7 @@ spring:
show-sql: true
file-server:
tempfile: /home/mia/parser_server/temp
root: ../dist
max-size: 1GB
max-chunk-size: 10MB
\ No newline at end of file
......@@ -32,6 +32,7 @@ spring:
show-sql: true
file-server:
tempfile: /home/mia/parser_server/temp
root: ../dist
max-size: 1GB
max-chunk-size: 10MB
\ No newline at end of file
......@@ -32,6 +32,7 @@ spring:
show-sql: true
file-server:
tempfile: /home/mia/parser_server/temp
root: ../dist
max-size: 1GB
max-chunk-size: 10MB
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment