Permalink
Browse files

Merge pull request #159 from evelinacs/irtg

Add scripts for sorting NPs
  • Loading branch information...
recski committed Oct 1, 2018
2 parents 1c6c35c + 56ab664 commit 4c2931464b085388ea25de3f18223e97658c6166
@@ -0,0 +1,14 @@
#!/usr/bin/env python3

import sys
from nltk.tree import Tree

def sort_nps():
    """Print the trees from the input file that have at most three children.

    The input path is taken from ``sys.argv[1]``. Each non-blank line is
    parsed as one bracketed tree with ``Tree.fromstring``; a line is echoed
    unchanged when the root of its tree has three or fewer direct children.
    """
    with open(sys.argv[1]) as np_doc:
        for line in np_doc:
            # Tree.fromstring raises ValueError on an empty string, so a
            # blank (e.g. trailing) line would otherwise abort the whole run.
            if not line.strip():
                continue
            width = len(Tree.fromstring(line))
            if width <= 3:
                print(line, end="")

# Script entry point: runs unconditionally at module load (there is no
# __main__ guard), so importing this file also triggers the run.
sort_nps()
@@ -0,0 +1,33 @@
#!/usr/bin/env python3

import sys
from nltk.tree import Tree

# Converts Penn Treebank trees to an Alto-compatible format

def format_tree():
    """Print every tree of the input file in the converted format.

    The input path is taken from ``sys.argv[1]``. Each non-blank line is
    parsed as one bracketed tree with ``Tree.fromstring`` and handed to
    ``print_tree``; every converted tree is terminated by a newline.
    """
    with open(sys.argv[1]) as np_doc:
        for line in np_doc:
            # Tree.fromstring raises ValueError on an empty string, so a
            # blank (e.g. trailing) line would otherwise abort the whole run.
            if not line.strip():
                continue
            print_tree(Tree.fromstring(line))
            # print_tree never emits a newline itself; end the record here.
            print()


def print_tree(tree):
    """Recursively print ``tree`` as ``LABEL( child, child, ...)``.

    A tree of height 2 (a tag directly above its word) is printed as
    ``TAG( word)``. Any other tree is printed as its label followed by its
    comma-separated children in parentheses. An ``NP`` node with more than
    one child is printed with its child count appended (``NP2``, ``NP3``,
    ...).

    The tree itself is not modified: the numbered label is used only for
    printing. Nothing is returned and no trailing newline is written.
    """
    if tree.height() == 2:
        # pos, word; Stanford format
        print("{}( {})".format(tree.label(), tree[0]), end="")
        return

    label = tree.label()
    child_count = len(tree)
    if label == "NP" and child_count > 1:  # NP2, NP3...
        # Build the display label locally instead of calling set_label(),
        # so printing does not rewrite the caller's tree.
        label = "NP{}".format(child_count)

    print("{}( ".format(label), end="")
    for position, subtree in enumerate(tree):
        if position:
            # Separator goes between children, never after the last one.
            print(", ", end="")
        print_tree(subtree)
    print(")", end="")

# Script entry point: runs unconditionally at module load (there is no
# __main__ guard), so importing this file also triggers the conversion.
format_tree()
@@ -0,0 +1,15 @@
#!/usr/bin/env python3

import sys

def line_filter():
    """Print the lines of the input file, dropping repeated lines.

    The input path is taken from ``sys.argv[1]``. The first occurrence of
    each line is printed exactly as read and in its original position;
    later identical lines are skipped.
    """
    seen = set()
    with open(sys.argv[1]) as np_doc:
        for line in np_doc:
            # Compare without the line terminator: a final line that lacks
            # a trailing newline must still match an earlier identical line.
            key = line.rstrip("\n")
            if key in seen:
                continue
            seen.add(key)
            print(line, end="")



# Script entry point: runs unconditionally at module load (there is no
# __main__ guard), so importing this file also triggers the filtering.
line_filter()
@@ -0,0 +1,22 @@
#!/bin/bash
# Sort a file of NPs (one per line): alphabetically first, then stably by the
# numeric key emitted by sort_nps_width.py, then stably by the numeric key
# emitted by sort_nps_depth.py. The result is printed to stdout with
# identical lines removed.
#
# Usage: <this script> NP_FILE

# Stop at the first failing step instead of feeding partial data downstream.
set -euo pipefail

if [ "$#" -lt 1 ]; then
    echo "Usage: $0 NP_FILE" >&2
    exit 1
fi

NPS="${1}"

# Private temporary directory instead of fixed /tmp file names, so that
# concurrent runs cannot overwrite each other's intermediate results.
TMP_DIR="$(mktemp -d)"
TMP_ORDERED="${TMP_DIR}/tmp_ordered"
TMP_ORDERED2="${TMP_DIR}/tmp_ordered2"
TMP_ORDERED3="${TMP_DIR}/tmp_ordered3"

# Remove temporary files on every exit path, including failures.
trap 'rm -rf "${TMP_DIR}"' EXIT

# Sort input file in alphabetical order
sort -- "${NPS}" > "${TMP_ORDERED}"
# width: the helper's output is sorted numerically (-s keeps the previous
# order for ties), then cut drops the first space-separated field
# (presumably the numeric key the helper prepends).
python3 sort_nps_width.py "${TMP_ORDERED}" | sort -n -s | cut -d" " -f2- > "${TMP_ORDERED2}"
# depth: same scheme as for width
python3 sort_nps_depth.py "${TMP_ORDERED2}" | sort -n -s | cut -d" " -f2- > "${TMP_ORDERED3}"

# Get rid of identical lines
python3 line_filter.py "${TMP_ORDERED3}"
@@ -0,0 +1,12 @@
#!/usr/bin/env python3

import sys
from nltk.tree import Tree

def sort_nps():
    """Print every tree of the input file prefixed with its height.

    The input path is taken from ``sys.argv[1]``. Each non-blank line is
    parsed as one bracketed tree with ``Tree.fromstring`` and printed as
    ``<height> <original line>``, i.e. the height and the unchanged line
    separated by a single space.
    """
    with open(sys.argv[1]) as np_doc:
        for line in np_doc:
            # Tree.fromstring raises ValueError on an empty string, so a
            # blank (e.g. trailing) line would otherwise abort the whole run.
            if not line.strip():
                continue
            tree = Tree.fromstring(line)
            print(tree.height(), line, end="")

# Script entry point: runs unconditionally at module load (there is no
# __main__ guard), so importing this file also triggers the run.
sort_nps()
@@ -0,0 +1,13 @@
#!/usr/bin/env python3

import sys
from nltk.tree import Tree

def sort_nps():
    """Print every tree of the input file prefixed with its width.

    The input path is taken from ``sys.argv[1]``. Each non-blank line is
    parsed as one bracketed tree with ``Tree.fromstring`` and printed as
    ``<width> <original line>``, where the width is the number of direct
    children of the root (``len`` of the tree).
    """
    with open(sys.argv[1]) as np_doc:
        for line in np_doc:
            # Tree.fromstring raises ValueError on an empty string, so a
            # blank (e.g. trailing) line would otherwise abort the whole run.
            if not line.strip():
                continue
            width = len(Tree.fromstring(line))
            print(width, line, end="")

# Script entry point: runs unconditionally at module load (there is no
# __main__ guard), so importing this file also triggers the run.
sort_nps()

0 comments on commit 4c29314

Please sign in to comment.