Permalink
Please sign in to comment.
Browse files
Merge pull request #159 from evelinacs/irtg
Add scripts for sorting NPs
- Loading branch information...
Showing
with
109 additions
and 0 deletions.
- +14 −0 exp/alto/tools/filter_width.py
- +33 −0 exp/alto/tools/format_tree.py
- +15 −0 exp/alto/tools/line_filter.py
- +22 −0 exp/alto/tools/sort.sh
- +12 −0 exp/alto/tools/sort_nps_depth.py
- +13 −0 exp/alto/tools/sort_nps_width.py
| @@ -0,0 +1,14 @@ | |||
| #!/usr/bin/env python3 | |||
|
|
|||
| import sys | |||
| from nltk.tree import Tree | |||
|
|
|||
def sort_nps(path=None, max_width=3):
    """Print the trees of a file whose root has at most ``max_width`` children.

    Despite the name, nothing is sorted here: each non-blank line is parsed
    as a bracketed tree and echoed unchanged, in input order, when ``len()``
    of the parsed tree (the number of children of its root) does not exceed
    ``max_width``.

    Args:
        path: File with one bracketed tree per line. Defaults to
            ``sys.argv[1]`` so the command-line usage is unchanged.
        max_width: Largest root width a tree may have to be kept.
    """
    if path is None:
        path = sys.argv[1]
    with open(path) as np_doc:
        for line in np_doc:
            # A blank line (e.g. a trailing one) is not a tree and would make
            # Tree.fromstring() raise, aborting the whole run.
            if not line.strip():
                continue
            if len(Tree.fromstring(line)) <= max_width:
                print(line, end="")
|
|
|||
if __name__ == "__main__":
    # Run the filter only when executed as a script, so that importing this
    # module does not read sys.argv or print anything.
    sort_nps()
| @@ -0,0 +1,33 @@ | |||
| #!/usr/bin/env python3 | |||
|
|
|||
| import sys | |||
| from nltk.tree import Tree | |||
|
|
|||
| # Converts Penn Treebank trees to an Alto-compatible format | |||
|
|
|||
def format_tree(path=None):
    """Print every tree of a file in ``LABEL( child, ...)`` notation.

    Each non-blank line is parsed with Tree.fromstring(), written by
    print_tree(), and terminated with a newline, so the output holds one
    converted tree per line.

    Args:
        path: File with one bracketed tree per line. Defaults to
            ``sys.argv[1]`` so the command-line usage is unchanged.
    """
    if path is None:
        path = sys.argv[1]
    with open(path) as tree_doc:
        for line in tree_doc:
            # Blank lines are not trees; Tree.fromstring() would raise on
            # them and stop the conversion half-way through the file.
            if not line.strip():
                continue
            print_tree(Tree.fromstring(line))
            print()
|
|
|||
|
|
|||
def print_tree(tree):
    """Write ``tree`` to stdout as ``LABEL( child, child, ...)`` without a newline.

    A tree of height 2 is printed as ``POS( word)`` using its first child
    (pos, word; Stanford format). Any other tree is printed as its label
    followed by its recursively printed children, separated by ``", "``.
    An ``NP`` node with more than one child is printed with its child count
    appended to the label (NP2, NP3, ...).

    The relabelling is applied to the printed text only; ``tree`` itself is
    left unmodified.

    Args:
        tree: Object offering ``height()``, ``label()``, ``len()``, indexing
            and iteration over subtrees, as returned by Tree.fromstring().
    """
    label = tree.label()
    if tree.height() == 2:
        print("{}( {})".format(label, tree[0]), end="")
        return

    child_count = len(tree)
    if label == "NP" and child_count > 1:
        # NP2, NP3, ... -- kept in a local so the caller's tree is not changed.
        label = "NP{}".format(child_count)

    print("{}( ".format(label), end="")
    for position, subtree in enumerate(tree):
        if position:
            # Separator goes between children, never after the last one.
            print(", ", end="")
        print_tree(subtree)
    print(")", end="")
|
|
|||
if __name__ == "__main__":
    # Convert the file named on the command line only when run as a script,
    # so that importing this module has no side effects.
    format_tree()
| @@ -0,0 +1,15 @@ | |||
| #!/usr/bin/env python3 | |||
|
|
|||
| import sys | |||
|
|
|||
def line_filter():
    """Echo the file named by ``sys.argv[1]`` with repeated lines removed.

    Lines are compared verbatim (line ending included); only the first
    occurrence of each distinct line is printed, in input order.
    """
    emitted = set()
    with open(sys.argv[1]) as source:
        for text in source:
            if text in emitted:
                continue
            emitted.add(text)
            print(text, end="")
|
|
|||
|
|
|||
|
|
|||
if __name__ == "__main__":
    # De-duplicate the file named on the command line only when run as a
    # script, so that importing this module has no side effects.
    line_filter()
| @@ -0,0 +1,22 @@ | |||
#!/bin/bash
#
# Order the bracketed NP trees of a file by depth, then by width, then
# alphabetically, drop repeated lines and write the result to stdout.
#
# Usage: sort.sh NP_FILE

# Stop at the first failing step (including failures inside a pipeline)
# instead of silently carrying on with empty intermediate files.
set -euo pipefail

if [ "$#" -lt 1 ]; then
    echo "Usage: $0 NP_FILE" >&2
    exit 1
fi

NPS="${1}"

# Locate the helper scripts next to this file so the current working
# directory does not matter.
TOOLS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Use a private scratch directory rather than fixed names under /tmp:
# parallel runs no longer overwrite each other's files, and the trap
# removes everything even when a step fails.
TMP_DIR="$(mktemp -d)"
trap 'rm -rf "${TMP_DIR}"' EXIT
TMP_ORDERED="${TMP_DIR}/tmp_ordered"
TMP_ORDERED2="${TMP_DIR}/tmp_ordered2"
TMP_ORDERED3="${TMP_DIR}/tmp_ordered3"

# Sort input file in alphabetical order
sort "${NPS}" > "${TMP_ORDERED}"

# The helpers are Python 3 scripts (python3 shebang, print(..., end=...)),
# so they are always run with python3.
# Each pass prefixes every line with a numeric key, sorts on it with a
# stable numeric sort (ties keep the order of the previous pass) and then
# strips the key again.
# width
python3 "${TOOLS_DIR}/sort_nps_width.py" "${TMP_ORDERED}" | sort -n -s | cut -d" " -f2- > "${TMP_ORDERED2}"
# depth
python3 "${TOOLS_DIR}/sort_nps_depth.py" "${TMP_ORDERED2}" | sort -n -s | cut -d" " -f2- > "${TMP_ORDERED3}"

# Get rid of identical lines
python3 "${TOOLS_DIR}/line_filter.py" "${TMP_ORDERED3}"

# Temporary files are removed by the EXIT trap above.
| @@ -0,0 +1,12 @@ | |||
| #!/usr/bin/env python3 | |||
|
|
|||
| import sys | |||
| from nltk.tree import Tree | |||
|
|
|||
def sort_nps(path=None):
    """Prefix every tree line of a file with the height of its tree.

    Nothing is sorted here: each non-blank line is parsed as a bracketed
    tree and printed as ``"<height> <original line>"``, where the height is
    the value of the parsed tree's ``height()``. The numeric prefix is meant
    to be used as a key by an external sort.

    Args:
        path: File with one bracketed tree per line. Defaults to
            ``sys.argv[1]`` so the command-line usage is unchanged.
    """
    if path is None:
        path = sys.argv[1]
    with open(path) as np_doc:
        for line in np_doc:
            # A blank line is not a tree and would make Tree.fromstring()
            # raise, aborting the run and truncating the output.
            if not line.strip():
                continue
            tree = Tree.fromstring(line)
            print(tree.height(), line, end="")
|
|
|||
if __name__ == "__main__":
    # Emit the height-prefixed lines only when run as a script, so that
    # importing this module has no side effects.
    sort_nps()
| @@ -0,0 +1,13 @@ | |||
| #!/usr/bin/env python3 | |||
|
|
|||
| import sys | |||
| from nltk.tree import Tree | |||
|
|
|||
def sort_nps(path=None):
    """Prefix every tree line of a file with the width of its tree.

    Nothing is sorted here: each non-blank line is parsed as a bracketed
    tree and printed as ``"<width> <original line>"``, where the width is
    ``len()`` of the parsed tree (the number of children of its root). The
    numeric prefix is meant to be used as a key by an external sort.

    Args:
        path: File with one bracketed tree per line. Defaults to
            ``sys.argv[1]`` so the command-line usage is unchanged.
    """
    if path is None:
        path = sys.argv[1]
    with open(path) as np_doc:
        for line in np_doc:
            # A blank line is not a tree and would make Tree.fromstring()
            # raise, aborting the run and truncating the output.
            if not line.strip():
                continue
            width = len(Tree.fromstring(line))
            print(width, line, end="")
|
|
|||
if __name__ == "__main__":
    # Emit the width-prefixed lines only when run as a script, so that
    # importing this module has no side effects.
    sort_nps()
0 comments on commit
4c29314