Permalink
Browse files

minor modifications

  • Loading branch information...
Eszti committed Jul 18, 2016
1 parent 6a25bcc commit 995baf14285d71b3462374361e3fe7d8b89c8e8e
Showing with 8 additions and 32 deletions.
  1. +1 −1 dep_to_4lang.txt
  2. +3 −3 src/fourlang/lemmatizer.py
  3. +2 −24 src/fourlang/sim_feats.py
  4. +2 −4 src/fourlang/similarity.py
@@ -488,7 +488,7 @@ nmod:except_for -,-
nmod:far_from -,-
nmod:following -,-
nmod:for -,- FOR
- nmod:from -,- from
+ nmod:from -,-
nmod:given -,-
nmod:if -,-
nmod:in -,- IN
@@ -36,10 +36,10 @@ def _analyze(self, word):
self.cache[word] = (stem, lemma, candidates)

def _lemmatize_with_stopwords(self, word, uppercase):
- if not uppercase:
- return word
- elif word == 'have':
+ if word == 'have':
  return 'HAS'
+ elif not uppercase:
+ return word
elif word in self.stopwords:
return word.upper()
else:
@@ -103,7 +103,6 @@ def zero_connected(self, name1, links1, links1_expand, name2, links2, links2_exp
if val == -1:
if name1 in links2_expand or name2 in links1_expand:
val2 = 1
- #ret.update({ "0-connected_exp" : val2 })
return ret

def is_antonym(self, name1, nodes1, name2, nodes2):
@@ -124,6 +123,8 @@ def fullgraph(self, name1, name2):
####################
if self.calc_path:
length = 0
+ if name1 not in self.UG.nodes() or name2 not in self.UG.nodes():
+ return {"shortest_path" : length}
if nx.has_path(self.UG, name1, name2):
path = nx.shortest_path(self.UG, name1, name2)
length = len(path)
@@ -177,29 +178,6 @@ def __init__(self, machine1, machine2, max_depth):
name1 = machine1.printname()
name2 = machine2.printname()

- # TODO: hack
- # G1_str = MachineGraph.create_from_machines([machine1], max_depth=max_depth, str_graph=True)
- # G2_str = MachineGraph.create_from_machines([machine2], max_depth=max_depth, str_graph=True)
- #
- # print name1
- # print G1.G.nodes()
- # print G1_str.G.nodes()
- # print G1.G.edges()
- # print G1_str.G.edges()
- # print name2
- # print G2.G.nodes()
- # print G2_str.G.nodes()
- # print G2.G.edges()
- # print G2_str.G.edges()
- #
- # if(name2 == 'intelligent'):
- # G2_str.G = G2_str.G.to_undirected()
- # print nx.shortest_path_length(G2_str.G, 'intelligent', 'intelligence')
- # print G2_str.G.nodes(data=True)
- # print G2_str.G.edges(data=True)
-
- # TODO: end_hack
-
self.subgraph_dict = dict()
# self.subgraph_dict.update(self._get_subgraph_N(G1.G, G2.G, name1, name2))
# self.subgraph_dict.update(self._get_subgraph_N_X_N(G1.G, G2.G, name1, name2))
@@ -98,9 +98,8 @@ def lemma_similarities(self, lemma1, lemma2):
def word_similarities(self, word1, word2):
if (word1, word2) in self.word_sim_cache:
return self.word_sim_cache[(word1, word2)]
- # TODO: uppercase flag = ?
  lemma1, lemma2 = [self.lemmatizer.lemmatize(
- word, defined=self.defined_words, stem_first=True)
+ word, defined=self.defined_words, stem_first=True, uppercase=True)
for word in (word1, word2)]
# self.log(u'lemmas: {0}, {1}'.format(lemma1, lemma2))
if lemma1 is None or lemma2 is None:
@@ -330,10 +329,9 @@ def get_sims(self):
logging.warning('lemmatizing words to determine machine-OOVs...')
self.non_oov = set(
(word for word in self.non_oov
- # TODO: uppercase flag = ?
  if self.sim_wrapper.lemmatizer.lemmatize(
  word, defined=self.sim_wrapper.machine_wrapper.definitions,
- stem_first=True) is not None))
+ stem_first=True, uppercase=True) is not None))

logging.warning(
'kept {0} words after discarding those not in machine sim'.format(

0 comments on commit 995baf1

Please sign in to comment.