Permalink
Browse files

eliminated jython module, now using corenlp with dict_to_4lang, fixed #…

  • Loading branch information...
Gabor Recski
Gabor Recski committed Apr 20, 2017
1 parent c6361f4 commit 1d6e68459b6ad31f776add11bdeae95244a33515
@@ -24,5 +24,5 @@
"https://github.com/kornai/pymachine/tarball/master#egg=pymachine"],
install_requires=[
"nltk", "pymachine", "requests", "stemming", "unidecode", "pyzmq",
"graphviz"],
"graphviz", "scipy"],
)
@@ -7,9 +7,11 @@

from longman_parser import XMLParser


class ParserException(Exception):
    """Error type raised by this parser module when CoreNLP output
    cannot be processed (exact raise sites are elsewhere in the file)."""


class Parser(XMLParser):
sen_regex = re.compile(
'<sentence id="[0-9]*">(.*?)</sentence>', re.S)
@@ -69,7 +71,8 @@ def parse_corenlp_output(output):
parsed_sens = [Parser.parse_sen(sen)
for sen in Parser.sen_regex.findall(cl_output)]

parse_trees = [match for match in Parser.parse_tree_regex.findall(cl_output)]
parse_trees = [match
for match in Parser.parse_tree_regex.findall(cl_output)]

corefs_match = Parser.all_corefs_regex.search(cl_output)
if corefs_match is None:
@@ -78,6 +81,7 @@ def parse_corenlp_output(output):
corefs = Parser.parse_corefs(corefs_match.group(1))
return parsed_sens, corefs, parse_trees


class CoreNLPWrapper():

def __init__(self, cfg, is_server=False):
@@ -94,6 +98,18 @@ def parse_text(self, text):
def parse_sentences(self, sens):
    """Parse several sentences in one CoreNLP call.

    Joins *sens* with newlines and delegates to parse_text, returning
    whatever parse_text returns for the combined text.
    """
    joined_text = "\n".join(sens)
    return self.parse_text(joined_text)

def parse_entries(self, entries):
    """Run CoreNLP over the definition of every sense in *entries*.

    Mutates each sense in place: the 'definition' string is replaced by
    a dict holding the raw sentence ("sen"), the first dependency parse
    returned for it ("deps"), and the parse trees ("parse").  The coref
    component of the parse is discarded.  Returns the (mutated) entries.
    """
    for entry in entries:
        for sense in entry['senses']:
            definition_text = sense['definition']
            deps, _corefs, trees = self.parse_text(definition_text)
            sense['definition'] = {
                "sen": definition_text,
                "deps": deps[0],
                "parse": trees}
    return entries


def test():
cfg_file = 'conf/default.cfg' if len(sys.argv) < 2 else sys.argv[1]
cfg = ConfigParser()
@@ -130,13 +130,15 @@ def get_root_lemmas(self, deps):
for d in deps if d['type'] == 'root'] # TODO

def get_dep_definition(self, word, deps):
if isinstance(deps[0], unicode):
# TODO
root_lemmas = self.get_root_lemmas(
NewDependencies.create_from_old_deps(
Dependencies.create_from_strings(deps)).deps)
else:
root_lemmas = self.get_root_lemmas(deps)
# get NewDependencies from whatever type "deps" are
if isinstance(deps[0], unicode): # string dependencies
deps = NewDependencies.create_from_old_deps(
Dependencies.create_from_strings(deps)).deps
elif isinstance(deps[0], list): # old dependencies
deps = NewDependencies.create_from_old_deps(
Dependencies(deps)).deps

root_lemmas = self.get_root_lemmas(deps)
deps = self.dependency_processor.process_dependencies(deps)
if not root_lemmas:
logging.warning(
@@ -8,17 +8,17 @@
import time
import traceback

from collins_parser import CollinsParser
from corenlp_wrapper import CoreNLPWrapper
from dep_to_4lang import DepTo4lang
from eksz_parser import EkszParser
from entry_preprocessor import EntryPreprocessor
from lexicon import Lexicon
from longman_parser import LongmanParser
from wiktionary_parser import WiktParser
from stanford_wrapper import StanfordWrapper
from utils import batches, ensure_dir, get_cfg
from collins_parser import CollinsParser
from eksz_parser import EkszParser
from nszt_parser import NSzTParser
from magyarlanc_wrapper import Magyarlanc
from nszt_parser import NSzTParser
from utils import batches, ensure_dir, get_cfg
from wiktionary_parser import WiktParser

assert Lexicon # silence pyflakes (Lexicon must be imported for cPickle)

@@ -86,9 +86,8 @@ def process_entries(self, words):
(self.raw_dict[word] for word in words))

if self.lang == 'eng':
stanford_wrapper = StanfordWrapper(self.cfg)
entries = stanford_wrapper.parse_sentences(
entries, definitions=True)
corenlp_wrapper = CoreNLPWrapper(self.cfg)
entries = corenlp_wrapper.parse_entries(entries)
elif self.lang == 'hun':
magyarlanc_wrapper = Magyarlanc(self.cfg)
entries = magyarlanc_wrapper.parse_entries(entries)

This file was deleted.

Oops, something went wrong.
Oops, something went wrong.

0 comments on commit 1d6e684

Please sign in to comment.