postprocessing.py 3.6 KB
Newer Older
xuchen committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
import argparse
import re


def read_file(input_path):
    """Read a UTF-8 text file and return its lines with surrounding whitespace removed.

    Args:
        input_path: Path of the file to read.

    Returns:
        A list with one string per line of the file, each passed through
        ``str.strip()`` (so the trailing newline is dropped as well).
    """
    with open(input_path, 'r', encoding='utf8') as handle:
        return [raw.strip() for raw in handle]


def write_file(output_path, sentences):
    """Write each sentence to a UTF-8 text file, one per line.

    Args:
        output_path: Path of the file to create or overwrite.
        sentences: Iterable of strings; a newline is appended to each one.

    Returns:
        None.
    """
    with open(output_path, 'w', encoding='utf8') as handle:
        handle.writelines(text + '\n' for text in sentences)
    return None


def remove_tag(sentences):
    """Delete bracketed tags such as ``(Applaus)`` from every sentence.

    Text enclosed in ``(...)``, ``{...}`` or ``[...]`` is removed together
    with the brackets. Matching is non-greedy, so each opening bracket is
    paired with the nearest closing bracket of the same kind. Whitespace
    around a removed tag is left as it was.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A new list containing the sentences with the tags removed.
    """
    tag_pattern = re.compile(u"\\(.*?\\)|\\{.*?}|\\[.*?]")
    return [tag_pattern.sub("", text) for text in sentences]


def remove_beginning_punctuation(sentences):
    """Strip leading punctuation and spaces from every sentence.

    Any run of the characters ``,`` ``.`` ``?`` ``:`` ``-`` and space at the
    start of a sentence is removed, regardless of the order in which those
    characters appear.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A new list of sentences; those that do not start with one of the
        characters above (including empty strings) are returned unchanged.
    """
    # Strip the whole character set in one call. Stripping one character at
    # a time in a fixed order leaves residue for mixed runs: " , hello" would
    # come out as ", hello" because the comma pass runs before the space pass.
    remove_punctuation = ',.?:- '
    return [line.lstrip(remove_punctuation) for line in sentences]


def remove_ending_punctuation(sentences):
    """Strip trailing commas, colons and spaces from every sentence.

    Any run of the characters ``,`` ``:`` and space at the end of a sentence
    is removed, regardless of the order in which those characters appear.
    Sentence-final marks such as ``.`` or ``?`` are not touched.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A new list of sentences; those that do not end with one of the
        characters above (including empty strings) are returned unchanged.
    """
    # Strip the whole character set in one call. Stripping one character at
    # a time in a fixed order leaves residue for mixed runs: "hello, " would
    # come out as "hello," because the comma pass runs before the space pass.
    remove_punctuation = ',: '
    return [line.rstrip(remove_punctuation) for line in sentences]


def remove_space(sentences):
    """Normalise whitespace inside every sentence.

    Leading and trailing whitespace is dropped and every internal run of
    whitespace is collapsed to a single space character.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A new list of whitespace-normalised sentences.
    """
    # str.split() with no argument already ignores leading/trailing
    # whitespace, so splitting and re-joining covers both clean-ups.
    return [' '.join(text.split()) for text in sentences]


def remove_special_tag(sentences):
    """Replace every double hyphen ``--`` with an em dash (optional step).

    Replacement is left to right and non-overlapping, so a run of three
    hyphens becomes an em dash followed by one hyphen.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A new list of sentences with the replacement applied.
    """
    return [text.replace('--', '—') for text in sentences]


def first_letter_upper(sentences):
    """Upper-case the first character of every sentence that starts with a lowercase letter.

    Sentences that are empty, or whose first character is not a lowercase
    alphabetic character (digits, spaces, punctuation, capitals), are
    returned unchanged.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A new list of sentences with the first letter capitalised where
        applicable.
    """
    def _capitalise(text):
        # Guard clauses: nothing to do for empty strings or for a first
        # character that is not a lowercase letter.
        if len(text) == 0:
            return text
        head = text[0]
        if not (head.isalpha() and head.islower()):
            return text
        return head.upper() + text[1:]

    return [_capitalise(text) for text in sentences]


def add_last_punctuation(sentences):
    """Append a full stop to sentences that end with a letter.

    Only a final alphabetic character triggers the append; empty sentences
    and sentences ending in a digit, punctuation mark or whitespace are
    returned unchanged.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A new list of sentences, with ``.`` added where applicable.
    """
    return [
        text + '.' if len(text) > 0 and text[-1].isalpha() else text
        for text in sentences
    ]


def process(args):
    """Read the input file, apply the enabled clean-up steps, and write the result.

    Args:
        args: Object exposing ``input_absolute_path`` and
            ``output_absolute_path`` attributes.
    """
    source_path = args.input_absolute_path
    target_path = args.output_absolute_path

    # Steps are applied top to bottom. Entries that are commented out are
    # the clean-up steps currently disabled; they are kept here to show
    # where they belong in the sequence.
    pipeline = (
        remove_tag,
        # remove_beginning_punctuation,
        # remove_ending_punctuation,
        remove_special_tag,
        # remove_space,
        # first_letter_upper,
        # add_last_punctuation,
    )

    sentences = read_file(source_path)
    for step in pipeline:
        sentences = step(sentences)
    write_file(target_path, sentences)


def main():
    """Parse the command-line arguments and run the post-processing."""
    parser = argparse.ArgumentParser()
    # Both options are mandatory strings:
    #   --input_absolute_path   absolute path of the input file
    #   --output_absolute_path  absolute path of the output file
    for option in ("--input_absolute_path", "--output_absolute_path"):
        parser.add_argument(option, required=True, type=str)

    process(parser.parse_args())


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()