📜  使用python代码示例从pdf识别阿拉伯文文本

📅  最后修改于: 2022-03-11 14:45:06.408000             🧑  作者: Mango

代码示例1
import os
from os import chdir, getcwd, listdir, path
import codecs
import pyPdf
from time import strftime

def check_path(prompt):
    ''' (str) -> str
    Verifies if the provided absolute path does exist.
    '''
    abs_path = raw_input(prompt)
    while path.exists(abs_path) != True:
        print "\nThe specified path does not exist.\n"
        abs_path = raw_input(prompt)
    return abs_path    

print "\n"

folder = check_path("Provide absolute path for the folder: ")

list=[]
directory=folder
for root,dirs,files in os.walk(directory):
    for filename in files:
        if filename.endswith('.pdf'):
            t=os.path.join(directory,filename)

            list.append(t)

m=len(list)
print (m)
i=0
while i<=m-1:

    path=list[i]
    print(path)
    head,tail=os.path.split(path)
    var="\\"

    tail=tail.replace(".pdf",".txt")
    name=head+var+tail

    content = ""
    # Load PDF into pyPDF
    pdf = pyPdf.PdfFileReader(file(path, "rb"))
            # Iterate pages
    for j in range(0, pdf.getNumPages()):
        # Extract text from page and add to content
        content += pdf.getPage(j).extractText() + "\n"
    print strftime("%H:%M:%S"), " pdf  -> txt "
    f=open(name,'w')
    content.encode('utf-8')
    f.write(content)
    f.close
    i=i+1