Permalink
Please sign in to comment.
Browse files
Merge pull request #159 from evelinacs/irtg
Add scripts for sorting NPs
- Loading branch information...
Showing 6 changed files with 109 additions and 0 deletions.
- +14 −0 exp/alto/tools/filter_width.py
- +33 −0 exp/alto/tools/format_tree.py
- +15 −0 exp/alto/tools/line_filter.py
- +22 −0 exp/alto/tools/sort.sh
- +12 −0 exp/alto/tools/sort_nps_depth.py
- +13 −0 exp/alto/tools/sort_nps_width.py
@@ -0,0 +1,14 @@ | |||
#!/usr/bin/env python3 | |||
|
|||
import sys | |||
from nltk.tree import Tree | |||
|
|||
def sort_nps(path=None, max_width=3):
    """Print the trees whose root has at most ``max_width`` children.

    Despite its name, this function filters; it does not reorder anything.
    Every line of the input is parsed as one bracketed tree with
    ``Tree.fromstring`` and echoed unchanged when the root node has
    ``max_width`` direct children or fewer.

    Args:
        path: File holding one bracketed tree per line. Defaults to the
            first command-line argument (``sys.argv[1]``).
        max_width: Largest root child count that is still printed.
            Defaults to 3.
    """
    if path is None:
        path = sys.argv[1]
    with open(path) as np_doc:
        for line in np_doc:
            # Tree.fromstring raises ValueError on an empty string, so a
            # stray blank line would otherwise abort the whole run.
            if not line.strip():
                continue
            if len(Tree.fromstring(line)) <= max_width:
                print(line, end="")


# Only run when executed as a script, so importing the module has no
# side effects.
if __name__ == "__main__":
    sort_nps()
@@ -0,0 +1,33 @@ | |||
#!/usr/bin/env python3 | |||
|
|||
import sys | |||
from nltk.tree import Tree | |||
|
|||
#Converts Penn Treebank to Alto-compatible format | |||
|
|||
def format_tree(path=None):
    """Print every tree of a file in the format produced by ``print_tree``.

    Each non-blank line is parsed with ``Tree.fromstring`` and written to
    standard output by ``print_tree``, followed by a newline, so the output
    has one converted tree per input tree.

    Args:
        path: File holding one bracketed tree per line. Defaults to the
            first command-line argument (``sys.argv[1]``).
    """
    if path is None:
        path = sys.argv[1]
    with open(path) as np_doc:
        for line in np_doc:
            # Tree.fromstring raises ValueError on an empty string; skip
            # blank lines instead of aborting the conversion.
            if not line.strip():
                continue
            print_tree(Tree.fromstring(line))
            # print_tree emits no line terminator itself.
            print()
|
|||
|
|||
def print_tree(tree):
    """Write ``tree`` to standard output as ``LABEL( child, child, ...)``.

    A node of height 2 (a POS tag directly above a word) is printed as
    ``POS( word)``; only its first child is used. Any other node is printed
    as its label followed by its children, converted recursively and
    separated by ``", "``. An ``NP`` node with more than one child is
    printed with its child count appended to the label (``NP2``, ``NP3``,
    ...). No trailing newline is written.

    The tree passed in is left unmodified: the numbered label is computed
    locally rather than stored back on the node.

    Args:
        tree: A node offering ``height()``, ``label()``, ``len()``,
            indexing and iteration, such as ``nltk.tree.Tree``. Children of
            nodes higher than 2 are expected to be such nodes themselves.
    """
    if tree.height() == 2:
        # pos, word; Stanford format
        print("{}( {})".format(tree.label(), tree[0]), end="")
        return

    child_count = len(tree)
    label = tree.label()
    if label == "NP" and child_count > 1:
        # NP2, NP3, ...: encode the number of children in the label.
        label = "NP{}".format(child_count)

    print("{}( ".format(label), end="")
    for position, subtree in enumerate(tree):
        if position:
            # Separator goes between siblings, never after the last one.
            print(", ", end="")
        print_tree(subtree)
    print(")", end="")
|
|||
# Run the conversion only when executed as a script, so that importing this
# module neither reads sys.argv nor prints anything.
if __name__ == "__main__":
    format_tree()
@@ -0,0 +1,15 @@ | |||
#!/usr/bin/env python3 | |||
|
|||
import sys | |||
|
|||
def line_filter(path=None):
    """Print the lines of a file, dropping every repeated line.

    The first occurrence of each line is written to standard output in
    input order; later occurrences are skipped. Lines are compared without
    their trailing newline, so a final line that lacks one still counts as
    a duplicate of an identical earlier line.

    Args:
        path: File to filter. Defaults to the first command-line argument
            (``sys.argv[1]``).
    """
    if path is None:
        path = sys.argv[1]
    seen = set()
    with open(path) as np_doc:
        for line in np_doc:
            key = line.rstrip("\n")
            if key in seen:
                continue
            seen.add(key)
            print(line, end="")


# Only run when executed as a script, so importing the module has no
# side effects.
if __name__ == "__main__":
    line_filter()
@@ -0,0 +1,22 @@ | |||
#!/bin/bash
#
# Order the trees in a file by width, then by depth, and print them with
# identical lines removed.
#
# Usage: sort.sh NP_FILE

# Stop at the first failing step instead of carrying on with empty or
# partial intermediate files.
set -euo pipefail

if [ "$#" -ne 1 ]; then
    echo "Usage: $0 NP_FILE" >&2
    exit 1
fi

NPS="${1}"

# The helper scripts sit next to this one; resolve them relative to the
# script itself so the pipeline works from any working directory.
TOOLS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Use a private scratch directory rather than fixed names in /tmp, so that
# concurrent runs cannot overwrite each other's intermediate files. The
# trap removes it on every exit path, including failures.
TMP_DIR="$(mktemp -d)"
trap 'rm -rf "${TMP_DIR}"' EXIT
TMP_ORDERED="${TMP_DIR}/ordered"
TMP_ORDERED2="${TMP_DIR}/ordered2"
TMP_ORDERED3="${TMP_DIR}/ordered3"

# Sort input file in alphabetical order
sort -- "${NPS}" > "${TMP_ORDERED}"

# The helpers prefix each line with a number. "sort -n -s" orders by that
# number and, being stable, keeps ties in their previous order; "cut" then
# strips the prefix again. The helpers are Python 3 scripts, so they are
# all run with python3.

# width
python3 "${TOOLS_DIR}/sort_nps_width.py" "${TMP_ORDERED}" | sort -n -s | cut -d" " -f2- > "${TMP_ORDERED2}"
# depth
python3 "${TOOLS_DIR}/sort_nps_depth.py" "${TMP_ORDERED2}" | sort -n -s | cut -d" " -f2- > "${TMP_ORDERED3}"

# Get rid of identical lines
python3 "${TOOLS_DIR}/line_filter.py" "${TMP_ORDERED3}"
@@ -0,0 +1,12 @@ | |||
#!/usr/bin/env python3 | |||
|
|||
import sys | |||
from nltk.tree import Tree | |||
|
|||
def sort_nps(path=None):
    """Print every tree of a file prefixed with its height.

    Each non-blank line is parsed with ``Tree.fromstring`` and written to
    standard output as ``"<height> <original line>"``. Nothing is reordered
    here; the numeric prefix is meant to be consumed by a later sort step.

    Args:
        path: File holding one bracketed tree per line. Defaults to the
            first command-line argument (``sys.argv[1]``).
    """
    if path is None:
        path = sys.argv[1]
    with open(path) as np_doc:
        for line in np_doc:
            # Tree.fromstring raises ValueError on an empty string; skip
            # blank lines instead of aborting the run.
            if not line.strip():
                continue
            tree = Tree.fromstring(line)
            print(tree.height(), line, end="")


# Only run when executed as a script, so importing the module has no
# side effects.
if __name__ == "__main__":
    sort_nps()
@@ -0,0 +1,13 @@ | |||
#!/usr/bin/env python3 | |||
|
|||
import sys | |||
from nltk.tree import Tree | |||
|
|||
def sort_nps(path=None):
    """Print every tree of a file prefixed with its width.

    Each non-blank line is parsed with ``Tree.fromstring`` and written to
    standard output as ``"<width> <original line>"``, where the width is
    the number of direct children of the root node. Nothing is reordered
    here; the numeric prefix is meant to be consumed by a later sort step.

    Args:
        path: File holding one bracketed tree per line. Defaults to the
            first command-line argument (``sys.argv[1]``).
    """
    if path is None:
        path = sys.argv[1]
    with open(path) as np_doc:
        for line in np_doc:
            # Tree.fromstring raises ValueError on an empty string; skip
            # blank lines instead of aborting the run.
            if not line.strip():
                continue
            width = len(Tree.fromstring(line))
            print(width, line, end="")


# Only run when executed as a script, so importing the module has no
# side effects.
if __name__ == "__main__":
    sort_nps()
0 comments on commit
4c29314