tok.py 844 Bytes
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
#!/usr/bin/env python3
#
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import sys

import sacremoses


def main(args):
    """Moses-tokenize stdin line by line, keeping tab-separated fields apart.

    Every line read from standard input is split on tab characters, each
    resulting field is tokenized on its own with a
    ``sacremoses.MosesTokenizer`` built for ``args.lang``, and the tokenized
    fields are written to standard output joined by tabs again.

    Args:
        args: parsed command-line namespace; only ``args.lang`` is read here.
    """
    tokenizer = sacremoses.MosesTokenizer(lang=args.lang)

    for raw_line in sys.stdin:
        # The last field still carries the line's trailing newline when it is
        # handed to the tokenizer (presumably stripped there -- TODO confirm);
        # print() supplies the newline that ends each output line.
        tokenized_fields = [
            tokenizer.tokenize(field, return_str=True)
            for field in raw_line.split("\t")
        ]
        # Flush after every line so output appears promptly when piped.
        print("\t".join(tokenized_fields), flush=True)


if __name__ == "__main__":
    import argparse

    # Command-line entry point: build the option parser and hand the parsed
    # namespace straight to main().
    # NOTE(review): main() as written only reads --lang; --penn and --fields
    # are accepted and parsed but not consulted there.
    cli = argparse.ArgumentParser()
    cli.add_argument("--lang", "-l", default="en")
    cli.add_argument("--penn", "-p", action="store_true")
    cli.add_argument("--fields", "-f", help="fields to tokenize")

    main(cli.parse_args())