본문 바로가기

Web_Application/python

Python으로 텍스트의 한자를 한글로 변환하기



https://github.com/suminb/hanja

python 3.x에서 설치하기

1. Download Zip 다운로드 후 압축 풀기
2. 관리자 권한으로 명령프롬프트 실행
3. setup.py 가 있는 폴더로 이동
- cd C:\source\test\pylib\hanja-develop\hanja-develop

4. pip install . 실행

의존성 오류가 발생하면 오류 메시지에 표시된 패키지를 pip install 로 먼저 설치한 뒤 다시 실행하기

예제 코드 (지정한 폴더의 .java / .jsp 파일에서 한자를 찾아 한글 변환 결과를 텍스트 파일로 저장):
# -*- coding: utf-8 -*-
import pandas as pd
import os
import hanja
from hanja import hangul
from datetime import datetime
 
# Timestamp string (YYYYmmddHHMM with the seconds fixed to "00") that is later
# formatted into the output file name.
# NOTE(review): this rebinds the name `datetime` from the imported class to a
# plain string, so the class is no longer reachable under this name afterwards.
datetime = datetime.now().strftime("%Y%m%d%H%M00")
 
def _read_source_lines(file_path):
    """Return the lines of *file_path*, or ``None`` if it cannot be decoded.

    UTF-8 is tried first and CP949 second. Each attempt uses a context
    manager so the handle is closed whether or not decoding succeeds.
    """
    for encoding in ("utf8", "cp949"):
        try:
            with open(file_path, mode="r", encoding=encoding) as target_file:
                return target_file.readlines()
        except UnicodeDecodeError:
            # Wrong guess for this file; fall through to the next encoding.
            continue
    return None


def hanja_hangul_trans(file_path):
    """Report every Hanja run found in *file_path* with its Hangul reading.

    Each line is split with ``hanja.split_hanja``; every stripped chunk that
    consists only of Hanja characters is recorded once as
    ``"<hanja>,<hangul>"``. If at least one chunk is found, the file path is
    written to the module-level ``save_file`` followed by one line per
    distinct chunk, in order of first appearance. Files without Hanja
    produce no output.

    Args:
        file_path: Path of the text file to scan.

    Raises:
        OSError: If the file cannot be opened.
    """
    import sys

    lines = _read_source_lines(file_path)
    if lines is None:
        # Keep the scan best-effort, but make the skipped file visible.
        print(
            "skipped (not decodable as utf8 or cp949): {0}".format(file_path),
            file=sys.stderr,
        )
        return

    # A dict is used as an insertion-ordered set: O(1) de-duplication while
    # keeping first-seen order for the report.
    translations = {}

    for line in lines:
        for text_item in hanja.split_hanja(line):
            items = text_item.strip()

            # Skip empty chunks and chunks containing any non-Hanja character.
            if not items or not all(hanja.is_hanja(ch) for ch in items):
                continue

            trans_item_text = "{0},{1}".format(
                items, hanja.translate(items, 'substitution')
            )
            translations[trans_item_text] = None

    if not translations:
        return

    # Header line first, then the distinct translations for this file.
    print("{0}".format(file_path), file=save_file)
    for trans_item in translations:
        print("{0}".format(trans_item), file=save_file)
 
# Per-run settings: the folder to scan and the report file to write.

root_dir = r"{{ROOT_FOLDER}}"

save_file_path = r"{{저장파일}}-{0}.txt".format(datetime)

# `save_file` must stay a module-level name: hanja_hangul_trans() writes its
# report lines to it. The `with` block closes the file, so no explicit
# close() is needed afterwards.
with open(save_file_path, "w+", encoding="utf8") as save_file:
    for (root, dirs, files) in os.walk(root_dir):
        for file_name in files:
            _, file_extension = os.path.splitext(file_name)

            # Only Java sources and JSP pages are scanned.
            if file_extension in (".java", ".jsp"):
                # os.path.join uses the platform separator and avoids the
                # invalid "\{" escape of a hand-built "root\name" string.
                trans_file_path = os.path.join(root, file_name)

                hanja_hangul_trans(trans_file_path)
cs