# encoding: utf-8
# Practical Natural Language Processing Tools (practNLPTools-lite):
# Combination of Senna and Stanford dependency Extractor
# Copyright (C) 2017-19 PractNLP-lite Project
# Current Author: Jawahar S <jawahar273@gmail.com>
# URL: https://github.com/jawahar273
from __future__ import generators, print_function, unicode_literals
import subprocess
import os
from platform import architecture, system
# Optional coloured console output.  ``Fore`` is an object exposed by the
# ``colorama`` package (not a submodule), so the colour codes have to be read
# from it as attributes; ``from colorama.Fore import RED`` always raised and
# silently forced the plain-text fallback below.
try:
    from colorama import Fore, init
except ImportError:
    # colorama is not installed: keep the plain prefixes used for messages.
    RED = " "
    BLUE = " "
else:
    init(autoreset=True)
    RED = Fore.RED
    BLUE = Fore.BLUE
class Annotator:
    """Drive SENNA and the Stanford dependency extractor from Python.

    The class locates the SENNA binary and the ``stanford-parser`` jar,
    runs them as sub processes and turns SENNA's tab separated output
    into plain Python dictionaries.
    """

    def __init__(
        self,
        senna_dir="",
        stp_dir="",
        dep_model="edu.stanford.nlp.trees.EnglishGrammaticalStructure",
        raise_e=False,
        save_all=False,
        env=False,
        env_path="",
    ):
        """Resolve the tool locations and print the resulting settings.

        :param str senna_dir: directory holding the SENNA binaries. When
            empty, the ``SENNA`` environment variable is used if it is set.
        :param str stp_dir: directory holding the ``stanford-parser`` jar.
            When empty, the directory of ``pntl.tools`` is used and a
            missing jar always raises.
        :param str dep_model: class name handed to java as the dependency
            model.
        :param bool raise_e: raise when the jar is missing from a user
            supplied ``stp_dir``.
        :param bool save_all: hand every annotation to the configured end
            point after it has been built.
        :param bool env: load environment variables with ``dotenv`` first.
        :param str env_path: location of the environment file; defaults to
            ``.env`` in the current working directory.
        :raises OSError: the ``SENNA`` variable is set but the binary for
            this platform is not at that location.
        :raises FileNotFoundError: the jar is required and was not found.

        .. note::
            If the ``.env`` file lives somewhere else, pass its location
            together with the file name, for example
            ``/home/user_name/.env`` or ``C://user_name//.env``.
        """
        self.senna_path = ""
        self.dep_par_path = ""
        self.end_point = None
        self.save_all = save_all

        if env:
            from dotenv import load_dotenv

            if not env_path:
                from pathlib import Path

                env_path = Path(os.getcwd()) / ".env"
            load_dotenv(dotenv_path=env_path)

        if not senna_dir:
            if "SENNA" in os.environ:
                # The original statement `self.senna_path + os.path.sep`
                # discarded its result; the separator is required because
                # get_senna_bin() concatenates the binary name directly.
                self.senna_path = (
                    os.path.normpath(os.environ["SENNA"]) + os.path.sep
                )
                # get_senna_bin() expects an OS name, not a path.
                exe_file_2 = self.get_senna_bin(system())
                if not os.path.isfile(exe_file_2):
                    raise OSError(
                        RED + "Senna executable expected at %s or"
                        " %s but not found" % (self.senna_path, exe_file_2)
                    )
        elif senna_dir.startswith("."):
            self.senna_path = os.path.realpath(senna_dir) + os.path.sep
        else:
            self.senna_path = senna_dir.strip()
            self.senna_path = self.senna_path.rstrip(os.path.sep) + os.path.sep

        if not stp_dir:
            import pntl.tools as Tfile

            self.dep_par_path = os.path.dirname(Tfile.__file__) + os.path.sep
            self.check_stp_jar(self.dep_par_path, raise_e=True)
        else:
            self.dep_par_path = stp_dir + os.path.sep
            self.check_stp_jar(self.dep_par_path, raise_e)

        self.dep_par_model = dep_model
        self.default_jar_cli = [
            "java",
            "-cp",
            "stanford-parser.jar",
            self.dep_par_model,
            "-treeFile",
            "in.parse",
            "-collapsed",
        ]
        self.print_values()

    def print_values(self):
        """Print the SENNA location, the parser location and the java
        command line currently in use.
        """
        print("**" * 50)
        print(
            "default values:\nsenna path:\n",
            self.senna_path,
            "\nDependencie parser:\n",
            self.dep_par_path,
        )
        print("Stanford parser clr", " ".join(self.default_jar_cli))
        print("**" * 50)

    def check_stp_jar(self, path, raise_e=False, _rec=True):
        """Check that a ``stanford-parser*.jar`` file exists in ``path``.

        When the jar is not in ``path`` the installation directory of the
        ``pntl`` package is checked once as well, and the outcome of that
        second look-up is now actually used.

        :param str path: directory expected to contain the jar
        :param bool raise_e: raise instead of returning ``False``
        :param bool _rec: internal flag, ``False`` disables the look-up in
            the ``pntl`` installation directory
        :return: ``True`` when a matching jar was found
        :rtype: bool
        :raises FileNotFoundError: nothing was found and ``raise_e`` is set
        """
        # A missing directory simply counts as "not found" so that
        # raise_e=False never raises.
        file_found = os.path.isdir(path) and any(
            name.startswith("stanford-parser") and name.endswith(".jar")
            for name in os.listdir(path)
        )
        if not file_found and _rec:
            import pntl

            file_found = self.check_stp_jar(
                os.path.dirname(pntl.__file__), raise_e=False, _rec=False
            )
        if not file_found and raise_e:
            raise FileNotFoundError(
                RED + "`stanford-parser.jar` is "
                "not"
                " found in the path \n"
                "`{}` \n"
                "To know about more about the issues,"
                "got to this given link ["
                "http://pntl.readthedocs.io/en/"
                "latest/stanford_installing_"
                "issues.html] \n User "
                "`pntl -I true` to downlard "
                "needed file automatically.".format(path)
            )
        return file_found

    @property
    def stp_dir(self):
        """Directory of the Stanford parser jar.

        Assigning a value that is not an existing directory is ignored.

        :rtype: str
        """
        return self.dep_par_path

    @stp_dir.setter
    def stp_dir(self, val):
        if os.path.isdir(val):
            self.dep_par_path = val + os.path.sep

    @property
    def senna_dir(self):
        """Directory of the SENNA binaries.

        Assigning a value that is not an existing directory is ignored.

        :rtype: str
        """
        return self.senna_path

    @senna_dir.setter
    def senna_dir(self, val):
        if os.path.isdir(val):
            self.senna_path = val + os.path.sep

    @property
    def jar_cli(self):
        """Java command line as one space separated string.

        Assigning a string replaces the command; it is split on white
        space.

        :rtype: str
        """
        return " ".join(self.default_jar_cli)

    @jar_cli.setter
    def jar_cli(self, val):
        self.default_jar_cli = val.split()

    def get_senna_bin(self, os_name):
        """Return the SENNA binary path for the given operating system.

        :param str os_name: OS name such as ``Linux``, ``Darwin`` or
            ``Windows``
        :return: ``senna_path`` followed by the binary name; the generic
            ``senna`` binary is used for any other OS name (this used to
            fail with ``UnboundLocalError``)
        :rtype: str
        """
        if os_name == "Linux":
            linux_binaries = {"64bit": "senna-linux64", "32bit": "senna-linux32"}
            executable = linux_binaries.get(architecture()[0], "senna")
        elif os_name == "Darwin":
            executable = "senna-osx"
        elif os_name == "Windows":
            executable = "senna-win32.exe"
        else:
            executable = "senna"
        return self.senna_path + executable

    def _run_senna(self, payload):
        """Feed ``payload`` to SENNA on stdin and return its raw stdout."""
        package_directory = os.path.dirname(self.senna_path)
        executable = self.get_senna_bin(system())
        senna_executable = os.path.join(package_directory, executable)
        # `cwd=` starts the child inside the SENNA directory without
        # changing (and possibly never restoring) this process' own
        # working directory.
        pipe = subprocess.Popen(
            senna_executable,
            stdout=subprocess.PIPE,
            stdin=subprocess.PIPE,
            shell=True,
            cwd=package_directory or None,
        )
        return pipe.communicate(input=payload.encode("utf-8"))[0]

    def get_senna_tag(self, input_data):
        """Tag one sentence with SENNA.

        :param str/list input_data: the sentence, or its tokens
        :return: raw SENNA output
        :rtype: bytes
        """
        if isinstance(input_data, str):
            input_data = input_data.split()
        return self._run_senna(" ".join(input_data))

    def get_senna_tag_batch(self, sentences):
        """Tag several sentences with a single SENNA run.

        The sentences are sent one per line.

        :param list sentences: sentences to process
        :return: one block of SENNA output per sentence
        :rtype: list
        """
        senna_stdout = self._run_senna("\n".join(sentences))
        return senna_stdout.decode().split("\n\n")[0:-1]

    def get_dependency(self, parse):
        """Write ``parse`` to ``in.parse`` and run the java command on it.

        :param str parse: bracketed syntax tree(s)
        :return: standard output of the java command, stripped
        :rtype: str
        :raises OSError: the command in ``default_jar_cli`` cannot be
            started
        """
        # NOTE(review): the tree file is written below `senna_path` while
        # the default command reads `in.parse` and `stanford-parser.jar`
        # relative to the current working directory - confirm the intended
        # layout.
        with open(
            os.path.join(self.senna_path, "in.parse"), "w", encoding="utf-8"
        ) as parsefile:
            parsefile.write(parse)
        # An argument list must not be combined with shell=True: on POSIX
        # only the first item would be executed.  communicate() drains both
        # pipes, so a chatty child cannot block on a full pipe buffer.
        pipe = subprocess.Popen(
            self.default_jar_cli, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        stanford_out = pipe.communicate()[0]
        return stanford_out.decode("utf-8").strip()

    def get_batch_annotations(self, sentences, dep_parse=True):
        """Annotate several sentences.

        :param list sentences: list of sentences
        :param bool dep_parse: also run the dependency parser; the parser
            is started once for the whole batch
        :return: one annotation ``dict`` per sentence
        :rtype: list
        """
        annotations = [
            self._build_annotations(senna_tags)
            for senna_tags in self.get_senna_tag_batch(sentences)
        ]
        if dep_parse and annotations:
            syntax_tree = "".join(
                annotation["syntax_tree"] for annotation in annotations
            )
            dependencies = self.get_dependency(syntax_tree).split("\n\n")
            if len(annotations) == len(dependencies):
                for dependencie, annotation in zip(dependencies, annotations):
                    annotation["dep_parse"] = dependencie
            else:
                # The batch output could not be matched to the sentences:
                # parse every sentence on its own instead.
                for annotation in annotations:
                    annotation["dep_parse"] = self.get_dependency(
                        annotation["syntax_tree"]
                    )
        if self.save_all:
            for annotation in annotations:
                self._persist(annotation)
        return annotations

    def get_annoations(self, sentence="", senna_tags=None, dep_parse=True):
        """Annotate one sentence and return the result as a ``dict``.

        :param str or list sentence: sentence handed to SENNA when
            ``senna_tags`` is not given
        :param str senna_tags: SENNA output for one sentence, if it is
            already available
        :param bool dep_parse: also run the Stanford dependency parser
        :return: ``dict`` with the keys ``words``, ``pos``, ``ner``,
            ``srl``, ``verbs``, ``chunk``, ``dep_parse`` and
            ``syntax_tree``
        :rtype: dict
        """
        if not senna_tags:
            senna_tags = self.get_senna_tag(sentence).decode()
        annotations = self._build_annotations(senna_tags)
        if dep_parse and annotations["syntax_tree"]:
            annotations["dep_parse"] = self.get_dependency(
                annotations["syntax_tree"]
            )
        if self.save_all:
            self._persist(annotations)
        return annotations

    @staticmethod
    def _build_annotations(senna_tags):
        """Turn the SENNA output of one sentence into an annotation dict."""
        # Blank lines (SENNA ends its output with them) carry no token.
        rows = [
            line.strip().split("\t")
            for line in senna_tags.split("\n")
            if line.strip()
        ]
        # Columns: word, POS, chunk, NER, verb, one SRL column per verb
        # and finally the syntax tree fragment.
        no_verbs = len(rows[0]) - 6 if rows else 0
        words = [row[0].strip() for row in rows]
        pos = [row[1].strip() for row in rows]
        chunk = [row[2].strip() for row in rows]
        ner = [row[3].strip() for row in rows]
        verb = [row[4].strip() for row in rows]
        srls = [
            tuple(cell.strip() for cell in row[5:5 + no_verbs]) for row in rows
        ]
        syn = [row[-1] for row in rows]

        roles = []
        for j in range(no_verbs):
            role = {}
            pending = []  # words of a B-/I-/E- span that is still open
            for word, labels in zip(words, srls):
                splits = labels[j].split("-")
                prefix = splits[0]
                if prefix in ("B", "I"):
                    pending.append(word)
                    continue
                if prefix == "S":
                    phrase = word
                elif prefix == "E":
                    pending.append(word)
                    phrase = " ".join(pending)
                    pending = []
                else:
                    continue
                if len(splits) not in (2, 3):
                    continue
                key = "-".join(splits[1:])
                # The verb itself is replaced; every other role collects
                # all of its fragments, separated by a single space.
                if key != "V" and key in role:
                    role[key] += " " + phrase
                else:
                    role[key] = phrase
            if "V" in role:
                roles.append(role)

        return {
            "words": words,
            "pos": list(zip(words, pos)),
            "ner": list(zip(words, ner)),
            "srl": roles,
            # These two keys used to be swapped.
            "verbs": [x for x in verb if x != "-"],
            "chunk": list(zip(words, chunk)),
            "dep_parse": "",
            "syntax_tree": "".join(
                syn_.replace("*", "(" + pos_ + " " + word_ + ")")
                for (word_, syn_, pos_) in zip(words, syn, pos)
            ),
        }

    def _persist(self, annotations):
        """Create the configured end point and store ``annotations``."""
        from importlib import import_module

        # ENDPOINT_CLASS is a dotted path to a class: import the module
        # part and look the class up in it (importing the full path as a
        # module and calling the result could never work).
        target = os.getenv(
            "ENDPOINT_CLASS", default="snowbase.end_point.EntryPoint"
        )
        module_name, _, class_name = target.rpartition(".")
        end_point = getattr(import_module(module_name), class_name)()
        self.__to_sql(annotations, end_point)

    def __to_sql(self, annotations, end_point):
        end_point.insert(annotations)
        self.save(end_point)

    def save(self, end_point):
        """Call ``save()`` on the given end point object."""
        end_point.save()