Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Dejiao Zeng
parser-server
Commits
16ac9131
Commit
16ac9131
authored
Apr 25, 2023
by
Dejiao Zeng
Browse files
更新
parent
0443cdd7
Changes
8
Hide whitespace changes
Inline
Side-by-side
README.md
0 → 100644
View file @
16ac9131
# 记录一下脑仁疼的ocr.py环境安装方法
## Ubuntu 16.04 下先安装以下依赖
```
apt-get update
apt-get install lzma liblzma-dev libbz2-dev libssl-dev libffi-dev
sudo apt-get install curl
sudo add-apt-repository ppa:ubuntu-toolchain-r/test
sudo apt-get update
sudo apt-get install gcc-4.9
sudo apt-get upgrade libstdc++6
```
## Python源码安装
```
先修改/home/mia/Downloads/Python-3.8.16/Modules/Setup启用ssl
SSL=/usr
_ssl _ssl.c \
-DUSE_SSL -I$(SSL)/include -I$(SSL)/include/openssl \
-L$(SSL)/lib -lssl -lcrypto
./configure --enable-optimizations
make
sudo make install
```
## 创建环境
```
/usr/local/bin/python3 -m venv ocr
```
## 安装百度飞桨
```
python -m pip install paddlehub -i https://pypi.tuna.tsinghua.edu.cn/simple
python -m pip install paddlepaddle -i https://pypi.tuna.tsinghua.edu.cn/simple
```
## pom.xml 中的 cermine-impl 依赖可以从 Maven 仓库下载
## pom.xml 中的 common 依赖如果不需要，可以删除
\ No newline at end of file
src/main/java/com/csu/fileserver/controller/ExtractorController.java
View file @
16ac9131
...
...
@@ -8,6 +8,7 @@ import com.csu.common.execption.BadRequestException;
import
com.csu.fileserver.cermine.ArticleMeta
;
import
com.csu.fileserver.pdfextractor.CermineExtractorService
;
import
com.csu.fileserver.pdfextractor.ExtractionResult
;
import
com.csu.fileserver.utils.FileDownloader
;
import
com.csu.fileserver.vo.DynamicFormVO
;
import
com.csu.fileserver.vo.ExtractParam
;
import
org.apache.commons.io.FileUtils
;
...
...
@@ -18,6 +19,7 @@ import org.apache.pdfbox.text.PDFTextStripper;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
org.springframework.beans.factory.annotation.Autowired
;
import
org.springframework.beans.factory.annotation.Value
;
import
org.springframework.core.io.InputStreamResource
;
import
org.springframework.core.io.Resource
;
import
org.springframework.core.io.UrlResource
;
...
...
@@ -28,11 +30,13 @@ import java.io.FileInputStream;
import
java.io.IOException
;
import
java.io.InputStream
;
import
java.nio.charset.StandardCharsets
;
import
java.nio.file.Files
;
import
java.nio.file.Path
;
import
java.nio.file.Paths
;
import
java.util.ArrayList
;
import
java.util.Collections
;
import
java.util.List
;
import
java.util.UUID
;
import
java.util.regex.Matcher
;
import
java.util.regex.Pattern
;
...
...
@@ -42,10 +46,12 @@ import java.util.regex.Pattern;
public
class
ExtractorController
{
private
Logger
log
=
LoggerFactory
.
getLogger
(
this
.
getClass
());
private
final
CermineExtractorService
cermineExtractorService
;
private
final
String
tempfile
;
@Autowired
public
ExtractorController
(
CermineExtractorService
cermineExtractorService
)
{
public
ExtractorController
(
CermineExtractorService
cermineExtractorService
,
@Value
(
"${file-server.tempfile}"
)
String
tempfile
)
{
this
.
cermineExtractorService
=
cermineExtractorService
;
this
.
tempfile
=
tempfile
;
}
List
<
String
>
matchFirstPage
(
PDDocument
doc
,
List
<
String
>
target
)
throws
IOException
{
...
...
@@ -105,6 +111,11 @@ public class ExtractorController {
String
type
=
param
.
getType
();
String
wjdz
=
param
.
getWjdz
();
List
<
String
>
matched
=
Collections
.
emptyList
();
//singletonList("61672536")
if
(
wjdz
.
startsWith
(
"http"
)){
String
tempfilewithuuid
=
tempfile
+
"/"
+
UUID
.
randomUUID
()
+
".pdf"
;
FileDownloader
.
downloadFile
(
wjdz
,
tempfilewithuuid
);
wjdz
=
tempfilewithuuid
;
}
log
.
info
(
"parse start"
);
if
(
"CG_QK"
.
equals
(
type
)){
// FileInputStream ins = new FileInputStream(wjdz);
...
...
@@ -123,6 +134,7 @@ public class ExtractorController {
}
catch
(
IOException
ioe
)
{
ioe
.
printStackTrace
();
}
Files
.
delete
(
Paths
.
get
(
wjdz
));
return
ResultHandler
.
ok
(
res
);
}
else
if
(
"CG_HY"
.
equals
(
type
)){
// FileInputStream ins = new FileInputStream(wjdz);
...
...
@@ -141,13 +153,14 @@ public class ExtractorController {
}
catch
(
IOException
ioe
)
{
ioe
.
printStackTrace
();
}
Files
.
delete
(
Paths
.
get
(
wjdz
));
return
ResultHandler
.
ok
(
res
);
}
else
if
(
"CG_ZZQ"
.
equals
(
type
)
||
"CG_ZL"
.
equals
(
type
)){
Path
temp
=
Paths
.
get
(
wjdz
);
if
(
"CG_ZZQ"
.
equals
(
type
))
type
=
"software"
;
else
type
=
"patent"
;
System
.
out
.
println
(
temp
.
toAbsolutePath
().
toString
());
Process
p
=
Runtime
.
getRuntime
().
exec
(
new
String
[]{
"python"
,
".
\\
script
\\
ocr.py"
,
temp
.
toAbsolutePath
().
toString
(),
type
});
Process
p
=
Runtime
.
getRuntime
().
exec
(
new
String
[]{
"python"
,
".
/
script
/
ocr.py"
,
temp
.
toAbsolutePath
().
toString
(),
type
});
p
.
waitFor
();
System
.
out
.
println
(
IOUtils
.
toString
(
p
.
getInputStream
(),
StandardCharsets
.
UTF_8
));
System
.
err
.
println
(
IOUtils
.
toString
(
p
.
getErrorStream
(),
StandardCharsets
.
UTF_8
));
...
...
@@ -161,9 +174,11 @@ public class ExtractorController {
e
.
printStackTrace
();
}
result
.
delete
();
Files
.
delete
(
Paths
.
get
(
wjdz
));
return
ResultHandler
.
ok
(
json
);
}
log
.
info
(
"parse end"
);
Files
.
delete
(
Paths
.
get
(
wjdz
));
return
ResultHandler
.
ok
(
Collections
.
emptyList
());
}
...
...
src/main/java/com/csu/fileserver/utils/FileDownloader.java
0 → 100644
View file @
16ac9131
package
com.csu.fileserver.utils
;
import
java.io.FileOutputStream
;
import
java.io.IOException
;
import
java.io.InputStream
;
import
java.net.URL
;
import
java.net.URLConnection
;
/**
 * Utility for copying the content behind a URL into a local file.
 */
public class FileDownloader {

    /** Size in bytes of the buffer used while copying the response body to disk. */
    private static final int BUFFER_SIZE = 4096;

    /**
     * Downloads the resource at {@code fileUrl} and writes it to {@code saveFilePath}.
     *
     * <p>The target file is created if it does not exist and overwritten if it does.
     * Both the source stream and the target file are closed when this method returns,
     * whether the copy completes or fails part-way through.
     *
     * @param fileUrl      URL to read from; any protocol understood by {@link URL}
     * @param saveFilePath path of the local file to write
     * @throws IOException if the URL is malformed, the source cannot be opened,
     *                     the target cannot be created, or the copy fails
     */
    public static void downloadFile(String fileUrl, String saveFilePath) throws IOException {
        URL url = new URL(fileUrl);
        URLConnection connection = url.openConnection();

        // try-with-resources guarantees both streams are released even when read()
        // or write() throws; the input is opened first so that an unreachable
        // source never creates (or truncates) the target file.
        try (InputStream inputStream = connection.getInputStream();
             FileOutputStream outputStream = new FileOutputStream(saveFilePath)) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int bytesRead;
            while ((bytesRead = inputStream.read(buffer)) != -1) {
                outputStream.write(buffer, 0, bytesRead);
            }
        }
    }
}
src/main/resources/application-dev.yml
View file @
16ac9131
...
...
@@ -32,6 +32,7 @@ spring:
show-sql
:
true
file-server
:
tempfile
:
/home/mia/parser_server/temp
root
:
../dist
max-size
:
1GB
max-chunk-size
:
10MB
\ No newline at end of file
src/main/resources/application-test.yml
View file @
16ac9131
...
...
@@ -32,6 +32,7 @@ spring:
show-sql
:
true
file-server
:
tempfile
:
/home/mia/parser_server/temp
root
:
../dist
max-size
:
1GB
max-chunk-size
:
10MB
\ No newline at end of file
src/main/resources/application.yml
View file @
16ac9131
...
...
@@ -32,6 +32,7 @@ spring:
show-sql
:
true
file-server
:
tempfile
:
/home/mia/parser_server/temp
root
:
../dist
max-size
:
1GB
max-chunk-size
:
10MB
\ No newline at end of file
相关资源/Python-3.8.16.tgz
0 → 100644
View file @
16ac9131
File added
相关资源/script.rar
0 → 100644
View file @
16ac9131
File added
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment