"""Functions for converting lexemes into ULF symbols.
Relies on the NLTK Snowball stemmer.
"""
from nltk.stem.snowball import SnowballStemmer
import eta.util.file as file
from eta.util.general import atom
# Shared English Snowball stemmer; to_ulf uses it to reduce (possibly plural) nouns to a stem.
STEMMER = SnowballStemmer("english")
# Lexicons loaded once at import time. to_ulf indexes each of them by the word being
# lexicalized and returns the stored entry instead of its rule-based default.
# NOTE(review): in_module=True presumably resolves the path relative to the eta package --
# confirm against eta.util.file.load_json.
# Consulted for the 'name' category.
NAMES = file.load_json('resources/lexical/ulf/names.json', in_module=True)
# Consulted for the 'n' / 'noun' categories.
NOUNS = file.load_json('resources/lexical/ulf/nouns.json', in_module=True)
# Consulted for the 'v' / 'verb' categories (and, indirectly, 'v-' / 'verb-untensed').
VERBS = file.load_json('resources/lexical/ulf/verbs.json', in_module=True)
# Consulted for the passive verb categories; to_ulf takes element [1] of an entry for the
# untensed passive, so entries are presumably of the form [tense, [pasv, verb]] -- verify
# against the JSON file.
VERBS_PASV = file.load_json('resources/lexical/ulf/verbs_passive.json', in_module=True)
# Consulted for the 'wh-pred' category.
WH_PREDS = file.load_json('resources/lexical/ulf/wh_preds.json', in_module=True)
# Consulted for the 'sup-adj' category before falling back to stem_superlative.
SUP_ADJS = file.load_json('resources/lexical/ulf/sup_adjs.json', in_module=True)
# Consulted for the 'adv-adj' category.
ADV_ADJS = file.load_json('resources/lexical/ulf/adv_adjs.json', in_module=True)
[docs]
def to_ulf(cat, word):
    """Lexicalize a word as a ULF expression of the requested type category.

    This version is incomplete and intended for very limited experimental use.
    Categories fall into three groups: those that only attach a fixed suffix
    to the word, those that first consult one of the module-level lexicons
    (``NAMES``, ``NOUNS``, ``VERBS``, ...), and a catch-all that uses the
    category itself as the suffix.

    Parameters
    ----------
    cat : str
        The ULF type category to generate (e.g., ``name``, ``v``, or ``p``).
    word : str
        The word to lexicalize (e.g., ``mercedes``, ``are``, or ``on-top-of``).

    Returns
    -------
    s-expr
        The generated ULF: either a string atom such as ``'red.a'`` or a
        nested list such as ``['plur', 'block.n']``.
    """
    if cat == 'name':
        if word in NAMES:
            return NAMES[word]
        # Words shorter than four characters are upper-cased with a trailing
        # space inside the bars; longer ones are title-cased, with
        # underscores turned into spaces.
        if len(word) < 4:
            return f'|{word.upper()} |'
        return f"|{word.replace('_', ' ').title()}|"

    if cat == 'punc':
        return word

    # Categories that need nothing beyond a fixed suffix on the word.
    fixed_suffixes = (
        (('pro',), 'pro'),
        (('nn',), 'n'),
        (('p', 'prep'), 'p'),
        (('ps',), 'ps'),
        (('d', 'det'), 'd'),
        (('a', 'adj'), 'a'),
    )
    for categories, suffix in fixed_suffixes:
        if cat in categories:
            return f'{word}.{suffix}'

    if cat == 'nns':
        return ['plur', f'{STEMMER.stem(word)}.n']

    if cat in ('n', 'noun'):
        if word in NOUNS:
            return NOUNS[word]
        base = STEMMER.stem(word)
        # Assume singular if stemming left the word unchanged (somewhat
        # error-prone).
        return f'{word}.n' if base == word else ['plur', f'{base}.n']

    if cat in ('v', 'verb'):
        return VERBS[word] if word in VERBS else f'{word}.v'

    if cat in ('v-pasv', 'verb-passive'):
        if word in VERBS_PASV:
            return VERBS_PASV[word]
        return ['past', ['pasv', f'{word}.v']]

    if cat in ('v-pasv-', 'verb-passive-'):
        # Untensed passive: drop the tense operator from the lexicon entry.
        if word in VERBS_PASV:
            return VERBS_PASV[word][1]
        return ['pasv', f'{word}.v']

    if cat in ('v-', 'verb-untensed'):
        # This is a bit hacky: build the tensed form, then strip the tense
        # operator (the first element) when the result is not an atom.
        tensed = to_ulf('v', word)
        return tensed if atom(tensed) else tensed[1]

    if cat == 'wh-pred':
        return WH_PREDS[word] if word in WH_PREDS else f'{word}.adv'

    if cat == 'sup-adj':
        if word in SUP_ADJS:
            return SUP_ADJS[word]
        return f'{stem_superlative(word)}.a'

    if cat == 'adv-adj':
        return ADV_ADJS[word] if word in ADV_ADJS else f'{word}.a'

    # Unknown category: use the category name itself as the ULF suffix.
    return f'{word}.{cat}'
# END to_ulf
[docs]
def stem_superlative(sup_adv):
    """Find the stem of a superlative adjective.

    This is a stub, intended as a default when the calling program has not
    found a word-specific stem. It drops the final three characters (the
    ``est`` ending) and then applies two simple spelling repairs:

    * a doubled final consonant that was probably introduced by the suffix
      is reduced to a single one (``biggest`` -> ``big``);
    * for words ending in ``iest`` the final ``i`` of the stem is restored
      to ``y`` (``happiest`` -> ``happy``).

    No lexicon is consulted, so stems that drop a final ``e``
    (``largest`` -> ``larg``) or that genuinely end in a doubled consonant
    (``oddest`` -> ``od``) are still returned incorrectly.

    Parameters
    ----------
    sup_adv : str
        A superlative adjective such as ``biggest``, ``smartest``, or
        ``happiest``.

    Returns
    -------
    str
        The stemmed superlative.
    """
    # Consonants that are commonly doubled before "-est". "ll" and "ss" are
    # deliberately absent: they usually belong to the stem itself (tallest,
    # grossest).
    doubled_consonants = ('bb', 'dd', 'gg', 'mm', 'nn', 'pp', 'tt')

    stem = sup_adv[:-3]
    if stem[-2:] in doubled_consonants:
        return stem[:-1]
    # "-iest" comes from a "-y" adjective; the length check skips the bare
    # string "iest", which has no stem before the "i".
    if len(stem) > 1 and sup_adv.endswith('iest'):
        return stem[:-1] + 'y'
    return stem