176 lines
5.2 KiB
Python
176 lines
5.2 KiB
Python
|
# GUI libraries (tkinter)
|
||
|
from tkinter import *
|
||
|
from tkinter import ttk
|
||
|
from tkinter.filedialog import askopenfilename
|
||
|
from tkinter import scrolledtext
|
||
|
# OCR / PDF rendering libraries and standard-library helpers
|
||
|
from PIL import Image
|
||
|
import pytesseract
|
||
|
import sys
|
||
|
from pdf2image import convert_from_path
|
||
|
import os
|
||
|
import shutil
|
||
|
import threading
|
||
|
|
||
|
|
||
|
|
||
|
def _write_text(path, text):
    """Append *text* to the UTF-8 file at *path*, always closing the file."""
    with open(path, "a", encoding='utf-8') as f:
        f.write(text)


def _convert_pdf_pages(file, images_path, output_path, page_start, page_end, language):
    """Render the requested PDF pages, then save each page as JPEG and OCR text."""
    pages = convert_from_path(file, output_folder=images_path, thread_count=8,
                              first_page=page_start, last_page=page_end)
    pages_num = len(pages)
    if page_end is None:
        # No explicit end page was requested: derive it from the number of
        # rendered pages and show it in the GUI's end-page field.
        page_end = page_start + pages_num - 1
        page_stop.set(page_end)
    my_print("all %d pages, from %d to %d" % (pages_num, page_start, page_end))
    pb["maximum"] = pages_num
    pb["value"] = 0

    converted = 0
    for page_num, page in enumerate(pages, start=page_start):
        if page_num > page_end:
            # Defensive: never emit pages beyond the requested range.
            break
        my_print("convert page %d..." % page_num)
        txt_filename = os.path.join(output_path, "page_%d.txt" % page_num)
        jpg_filename = os.path.join(output_path, "page_%d.jpg" % page_num)
        page.save(jpg_filename, 'JPEG')
        try:
            text = str(pytesseract.image_to_string(page, lang=language))
        except Exception:
            # A failed page is reported and skipped; the remaining pages
            # are still converted.
            my_print("[error] convert page %d failed!" % page_num)
        else:
            _write_text(txt_filename, text)
            my_print("convert page %d success!" % page_num)
        converted += 1
        pb["value"] = converted
    my_print("output files in %s" % output_path)


def _convert_image_file(file, language):
    """OCR a single image file and append the text to ``<file stem>.txt``."""
    my_print("convert jpg file %s..." % file)
    try:
        with Image.open(file) as image_file:
            text = str(pytesseract.image_to_string(image_file, lang=language))
    except Exception:
        my_print("[error] convert jpg file %s failed !" % file)
    else:
        _write_text(os.path.splitext(file)[0] + ".txt", text)
        my_print("convert jpg file %s success !" % file)


def start_convert_file(file, page_start=1, page_end=None, language='kor'):
    """Run OCR on a PDF or image file and report progress in the GUI.

    ``start_click`` runs this function in a background thread.

    WARNING: the folders ``<current working dir>/images`` and
    ``<directory of file>/output`` are deleted and recreated on every call.

    Args:
        file: Path of the input file. A name ending in ``.pdf`` (any case)
            is rendered page by page; anything else is opened as an image.
        page_start: First PDF page to convert (passed to pdf2image as
            ``first_page``).
        page_end: Last PDF page to convert, or ``None`` to convert through
            the last rendered page.
        language: Language code passed to pytesseract as ``lang``.

    Any exception that is not handled per page propagates to the caller,
    but the start button is re-enabled and the temporary image folder is
    removed first.
    """
    my_print("********* convert start ! *********")
    # os.path.join keeps the paths valid on every platform and avoids
    # resolving to the drive root when ``file`` has no directory part.
    images_path = os.path.join(os.getcwd(), "images")
    output_path = os.path.join(os.path.dirname(file), "output")
    try:
        for folder in (images_path, output_path):
            if os.path.exists(folder):
                shutil.rmtree(folder)
            os.makedirs(folder)

        if file.lower().endswith(".pdf"):
            _convert_pdf_pages(file, images_path, output_path,
                               page_start, page_end, language)
        else:
            _convert_image_file(file, language)
    finally:
        # Always give control back to the user and drop the temporary
        # page images, even when the conversion aborted with an error.
        bt['state'] = NORMAL
        if os.path.isdir(images_path):
            shutil.rmtree(images_path)
    my_print("**************** convert done ! ****************")
    my_print("************* thanks for using ^_^ *************\n")
|
||
|
|
||
|
# Main window; it has to exist before any widget or Tk variable is created.
root = Tk()

# Widgets that are referenced from several functions in this module.
cb = ttk.Combobox(master=root)                  # language selector
pb = ttk.Progressbar(master=root, length=400)   # conversion progress
t = scrolledtext.ScrolledText(master=root)      # log output area

# Tk variables bound to the entry fields of the form.
file_name = StringVar(master=root)
page_start = IntVar(master=root)
page_stop = IntVar(master=root)

# OCR language codes; start_click() picks one by the combobox index.
language = [
    "kor",
    "eng",
    "chi_sim",
]
|
||
|
|
||
|
def open_file():
    """Let the user pick a PDF or JPG file and store its path in ``file_name``.

    If the dialog returns an empty result (presumably because it was
    cancelled), the previously selected path is kept instead of being
    overwritten with an empty string.
    """
    selected = askopenfilename(
        filetypes=(("PDF Files", "*.pdf"), ("JPG Files", "*.jpg")))
    if not selected:
        return
    file_name.set(selected)
|
||
|
|
||
|
|
||
|
def start_click():
    """Validate the form and start the conversion in a background thread.

    The start button is disabled only after validation succeeds, so an
    invalid form never leaves it stuck in the disabled state.
    ``start_convert_file`` re-enables it when the conversion ends.
    """
    selected_file = file_name.get()
    if selected_file == "":
        my_print("[error] please selece a pdf file!!!!!")
        return

    try:
        first_page = page_start.get()
        last_page = page_stop.get()
    except (TclError, ValueError):
        # IntVar.get() fails when an entry holds something that is not a
        # number (for example an empty field).
        my_print("[error] page range must be whole numbers!")
        return

    # Anything below 1 means "start at the first page".
    if first_page < 1:
        first_page = 1
        page_start.set(first_page)

    if last_page == 0:
        # 0 means "no explicit end page": convert through the last page.
        last_page = None
    elif last_page < first_page:
        last_page = first_page
        page_stop.set(last_page)

    # NOTE(review): when the combobox text is not one of its values,
    # current() is expected to be -1, which the modulo maps to the last
    # language in the list - confirm this fallback is intended.
    ocr_language = language[cb.current() % len(language)]

    bt['state'] = DISABLED
    worker = threading.Thread(
        target=start_convert_file,
        args=(selected_file, first_page, last_page, ocr_language),
        daemon=True,
    )
    worker.start()
|
||
|
|
||
|
# Start button; kept at module level because start_click() and
# start_convert_file() both change its state.
bt = Button(master=root, text="start", width=10, command=start_click)
|
||
|
|
||
|
def my_print(str):
    """Append one line of text to the log widget and scroll to the end.

    Args:
        str: The message to show; a newline is appended to it.
    """
    line = str + "\n"
    t.insert(END, line)
    t.see(END)
|
||
|
|
||
|
def start_gui():
    """Place all widgets in the root window's grid and run the Tk main loop.

    The call returns only when ``root.mainloop()`` returns.
    """
    root.title("OCR TOOL")

    # Row 1: input file path and the "open" button.
    file_label = Label(root, text="pdf file:")
    file_label.grid(row=1, sticky=W)
    file_entry = Entry(root, textvariable=file_name, width=55)
    file_entry.grid(row=1, column=1, columnspan=3, sticky=W)
    open_button = Button(root, text="open", width=10, command=open_file)
    open_button.grid(row=1, column=4)

    # Row 2: language selector, first entry preselected.
    language_label = Label(root, text="language:")
    language_label.grid(row=2, sticky=W)
    cb.grid(row=2, column=1, sticky=W)
    cb.configure(values=("Korean", "English", "Chinese"))
    cb.current(0)

    # Row 3: page range "<start> ~ <stop>".
    range_label = Label(root, text="page range:")
    range_label.grid(row=3, sticky=W)
    start_entry = Entry(root, textvariable=page_start)
    start_entry.grid(row=3, column=1, sticky=W)
    tilde_label = Label(root, text="~")
    tilde_label.grid(row=3, column=2)
    stop_entry = Entry(root, textvariable=page_stop)
    stop_entry.grid(row=3, column=3, sticky=W)

    # Rows 4, 6 and 8 hold empty labels that act as vertical spacers.
    Label(root, text="").grid(row=4)

    # Row 5: start button.
    bt.grid(row=5, column=2)

    Label(root, text="").grid(row=6)

    # Row 7: progress bar, reset to 0 of 100.
    pb.grid(row=7, columnspan=5)
    pb.configure(maximum=100)
    pb.configure(value=0)

    Label(root, text="").grid(row=8)

    # Row 9: log output.
    t.grid(row=9, column=0, columnspan=5)

    root.mainloop()
|
||
|
|
||
|
|
||
|
|
||
|
class myStdout():
    """Redirect ``sys.stdout`` and ``sys.stderr`` into the log text widget.

    Creating an instance installs it as both streams; ``restoreStd`` puts
    the original streams back.
    """

    def __init__(self):
        # Remember the current streams so restoreStd() can reinstall them.
        self.stdoutbak = sys.stdout
        self.stderrbak = sys.stderr
        sys.stdout = self
        sys.stderr = self

    def write(self, info):
        """Append *info* to the text widget and return its length.

        Returning the number of characters follows the usual text-stream
        ``write`` contract.
        """
        t.insert('end', info)
        # Process pending GUI events so the new text is drawn immediately.
        t.update()
        t.see(END)
        return len(info)

    def flush(self):
        """Do nothing; every write is shown immediately.

        The method must exist because code that writes to ``sys.stdout`` /
        ``sys.stderr`` (for example ``print(..., flush=True)``) calls it.
        """

    def restoreStd(self):
        """Reinstall the streams that were active when this object was created."""
        sys.stdout = self.stdoutbak
        sys.stderr = self.stderrbak
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # Route print output and tracebacks into the GUI log while it runs.
    mystd = myStdout()
    try:
        start_gui()
    finally:
        # Once the main loop has ended, writing to the text widget may no
        # longer work, so hand the real streams back for anything printed
        # afterwards.
        mystd.restoreStd()
|
||
|
|