# Source code for pntl.tools

# encoding: utf-8
# Practical Natural Language Processing Tools (practNLPTools-lite):
#               Combination of Senna and Stanford dependency Extractor
# Copyright (C) 2017-19 PractNLP-lite Project
# Current Author: Jawahar S <jawahar273@gmail.com>
# URL: https://github.com/jawahar273


from __future__ import generators, print_function, unicode_literals
import subprocess

import os
from platform import architecture, system


# colorama is an optional dependency that is only used to colour error
# messages.  ``Fore`` is an object exported by the ``colorama`` package (not a
# submodule), so it has to be imported by name and its attributes read;
# ``from colorama.Fore import ...`` always raises ImportError.
try:
    from colorama import Fore, init
except ImportError:
    # Without colorama the colour prefixes degrade to a plain space.
    RED = " "
    BLUE = " "
else:
    init(autoreset=True)
    RED = Fore.RED
    BLUE = Fore.BLUE


class Annotator:
    """Wrapper around the SENNA tagger and the Stanford dependency extractor.

    SENNA is run as a sub process to obtain POS, chunk, NER, SRL and syntax
    tree tags; the syntax tree can then be handed to ``stanford-parser.jar``
    to obtain a dependency parse.
    """

    def __init__(
        self,
        senna_dir="",
        stp_dir="",
        dep_model="edu.stanford.nlp.trees.EnglishGrammaticalStructure",
        raise_e=False,
        save_all=False,
        env=False,
        env_path="",
    ):
        """Resolve the SENNA and Stanford parser locations.

        :param str senna_dir: directory containing the SENNA executable. When
            empty, the ``SENNA`` environment variable is used if it is set.
        :param str stp_dir: directory containing ``stanford-parser*.jar``.
            When empty, the directory of this module is used and a missing
            jar always raises.
        :param str dep_model: Stanford dependency model class name.
        :param bool raise_e: raise ``FileNotFoundError`` when the jar cannot
            be found for an explicitly given ``stp_dir``.
        :param bool save_all: store every annotation through the configured
            end point (see :meth:`get_annoations`).
        :param bool env: load an environment file with ``python-dotenv``
            before reading ``SENNA``.
        :param str env_path: location of the environment file; defaults to
            ``.env`` in the current working directory.
        :raises OSError: ``SENNA`` is set but no executable is found there.
        :raises FileNotFoundError: the Stanford parser jar is required but
            missing.
        """
        self.senna_path = ""
        self.dep_par_path = ""
        self.end_point = None
        self.save_all = save_all

        if env:
            # python-dotenv is optional, so it is imported only on demand.
            from dotenv import load_dotenv

            if not env_path:
                from pathlib import Path

                env_path = Path.cwd() / ".env"
            load_dotenv(dotenv_path=env_path)

        if not senna_dir:
            if "SENNA" in os.environ:
                # The trailing separator is required because the executable
                # name is appended by plain string concatenation.
                self.senna_path = (
                    os.path.normpath(os.environ["SENNA"]) + os.path.sep
                )
                senna_binary = self.get_senna_bin(system())
                if not os.path.isfile(senna_binary):
                    raise OSError(
                        RED + "Senna executable expected at %s or"
                        " %s but not found" % (self.senna_path, senna_binary)
                    )
        elif senna_dir.startswith("."):
            self.senna_path = os.path.realpath(senna_dir) + os.path.sep
        else:
            self.senna_path = (
                senna_dir.strip().rstrip(os.path.sep) + os.path.sep
            )

        if not stp_dir:
            # Default to the directory this module is installed in.
            module_dir = os.path.dirname(os.path.abspath(__file__))
            self.dep_par_path = module_dir + os.path.sep
            self.check_stp_jar(self.dep_par_path, raise_e=True)
        else:
            self.dep_par_path = stp_dir + os.path.sep
            self.check_stp_jar(self.dep_par_path, raise_e)

        self.dep_par_model = dep_model
        self.default_jar_cli = [
            "java",
            "-cp",
            "stanford-parser.jar",
            self.dep_par_model,
            "-treeFile",
            "in.parse",
            "-collapsed",
        ]
        self.print_values()

    def print_values(self):
        """Print the SENNA path, the parser path and the java command line."""
        print("**" * 50)
        print(
            "default values:\nsenna path:\n",
            self.senna_path,
            "\nDependencie parser:\n",
            self.dep_par_path,
        )
        print("Stanford parser clr", " ".join(self.default_jar_cli))
        print("**" * 50)

    def check_stp_jar(self, path, raise_e=False, _rec=True):
        """Check that a ``stanford-parser*.jar`` file is available.

        ``path`` is searched first (not recursively).  When nothing is found
        there and ``_rec`` is true, the directory this module is installed in
        is searched as a fallback, so ``True`` may mean the jar was found in
        the install directory rather than in ``path``.

        :param str path: directory expected to contain the jar.
        :param bool raise_e: raise instead of returning ``False``.
        :param bool _rec: internal flag enabling the install-dir fallback.
        :return: whether a jar was found.
        :rtype: bool
        :raises FileNotFoundError: no jar was found and ``raise_e`` is true.
        """
        jar_present = any(
            name.startswith("stanford-parser") and name.endswith(".jar")
            for name in os.listdir(path)
        )
        if not jar_present and _rec:
            install_dir = os.path.dirname(os.path.abspath(__file__))
            # The fallback must not raise on its own: the error below should
            # name the directory the caller asked for.
            jar_present = self.check_stp_jar(
                install_dir, raise_e=False, _rec=False
            )
        if not jar_present and raise_e:
            raise FileNotFoundError(
                RED + "`stanford-parser.jar` is "
                "not"
                " found in the path \n"
                "`{}` \n"
                "To know about more about the issues,"
                "got to this given link ["
                "http://pntl.readthedocs.io/en/"
                "latest/stanford_installing_"
                "issues.html] \n User "
                "`pntl -I true` to downlard "
                "needed file automatically.".format(path)
            )
        return jar_present

    @property
    def stp_dir(self):
        """Directory of the Stanford parser jar.

        Assigning a value that is not an existing directory is ignored.
        """
        return self.dep_par_path

    @stp_dir.setter
    def stp_dir(self, val):
        if os.path.isdir(val):
            self.dep_par_path = val + os.path.sep

    @property
    def senna_dir(self):
        """Directory of the SENNA installation.

        Assigning a value that is not an existing directory is ignored.

        :rtype: str
        """
        return self.senna_path

    @senna_dir.setter
    def senna_dir(self, val):
        if os.path.isdir(val):
            self.senna_path = val + os.path.sep

    @property
    def jar_cli(self):
        """Command line used for ``stanford-parser.jar`` as a single string.

        Assigning a string replaces the command; it is split on whitespace.

        :rtype: str
        """
        return " ".join(self.default_jar_cli)

    @jar_cli.setter
    def jar_cli(self, val):
        self.default_jar_cli = val.split()

    def get_senna_bin(self, os_name):
        """Return the path of the SENNA executable for ``os_name``.

        :param str os_name: OS name as returned by ``platform.system()``
            (``Linux``, ``Darwin``, ``Windows``).
        :return: ``self.senna_path`` followed by the executable name; the
            generic name ``senna`` is used for an unrecognised OS or word
            size.
        :rtype: str
        """
        if os_name == "Linux":
            executable = {
                "64bit": "senna-linux64",
                "32bit": "senna-linux32",
            }.get(architecture()[0], "senna")
        elif os_name == "Darwin":
            executable = "senna-osx"
        elif os_name == "Windows":
            executable = "senna-win32.exe"
        else:
            executable = "senna"
        return self.senna_path + executable

    @classmethod
    def help_conll_format(cls):
        """Return the table of SENNA options documented on
        :meth:`get_conll_format`.

        The table is the second blank-line separated paragraph of that
        docstring.  The whole docstring (or an empty string when docstrings
        are stripped) is returned if that paragraph does not exist.
        """
        doc = cls.get_conll_format.__doc__ or ""
        paragraphs = doc.split("\n\n")
        return paragraphs[1] if len(paragraphs) > 1 else doc

    def _run_senna(self, text, options=()):
        """Pipe ``text`` through the SENNA executable and return raw stdout.

        SENNA is started with its own directory as working directory, without
        a shell, so ``options`` reach the executable unchanged and the
        working directory of this process is never modified.

        :param str text: data written to SENNA's standard input.
        :param options: extra command line switches.
        :return: SENNA's standard output.
        :rtype: bytes
        """
        executable = os.path.abspath(self.get_senna_bin(system()))
        working_dir = os.path.dirname(self.senna_path) or None
        pipe = subprocess.Popen(
            [executable, *options],
            stdout=subprocess.PIPE,
            stdin=subprocess.PIPE,
            cwd=working_dir,
        )
        return pipe.communicate(input=text.encode("utf-8"))[0]

    def get_conll_format(self, sentence, options="-srl -pos -ner -chk -psg"):
        """Run SENNA on ``sentence`` and return its CoNLL style output.
        The switches accepted in ``options`` are listed in the table below.

        +--------------+-----------------------------------------------+
        | options      | desc                                          |
        +==============+===============================================+
        | -verbose     | Display model informations (on the standard   |
        |              | error output, so it does not mess up the tag  |
        |              | outputs).                                     |
        +--------------+-----------------------------------------------+
        | -notokentags | Do not output tokens (first output column).   |
        +--------------+-----------------------------------------------+
        | -offsettags  | Output start/end character offset (in the     |
        |              | sentence), for each token.                    |
        +--------------+-----------------------------------------------+
        | -iobtags     | Output IOB tags instead of IOBES.             |
        +--------------+-----------------------------------------------+
        | -brackettags | Output 'bracket' tags instead of IOBES.       |
        +--------------+-----------------------------------------------+
        | -path        | Specify the path to the SENNA data and hash   |
        |              | directories, if you do not run SENNA in its   |
        |              | original directory. The path must end by "/". |
        +--------------+-----------------------------------------------+
        | -usrtokens   | Use user's tokens (space separated) instead   |
        |              | of SENNA tokenizer.                           |
        +--------------+-----------------------------------------------+
        | -posvbs      | Use verbs outputed by the POS tagger instead  |
        |              | of SRL style verbs for SRL task. You might    |
        |              | want to use this, as the SRL training task    |
        |              | ignore some verbs (many "be" and "have")      |
        |              | which might be not what you want.             |
        +--------------+-----------------------------------------------+
        | -usrvbs      | Use user's verbs (given in <file>) instead    |
        |              | of SENNA verbs for SRL task. The file must    |
        |              | contain one line per token, with an empty     |
        |              | line between each sentence. A line which is   |
        |              | not a "-" corresponds to a verb.              |
        +--------------+-----------------------------------------------+
        | -pos         | Instead of outputing tags for all tasks,      |
        |              | SENNA will output tags for the specified (one |
        |              | or more) tasks.                               |
        +--------------+-----------------------------------------------+
        | -chk         | Instead of outputing tags for all tasks,      |
        |              | SENNA will output tags for the specified (one |
        |              | or more) tasks.                               |
        +--------------+-----------------------------------------------+
        | -ner         | Instead of outputing tags for all tasks,      |
        |              | SENNA will output tags for the specified (one |
        |              | or more) tasks.                               |
        +--------------+-----------------------------------------------+
        | -srl         | Instead of outputing tags for all tasks,      |
        |              | SENNA will output tags for the specified (one |
        |              | or more) tasks.                               |
        +--------------+-----------------------------------------------+
        | -psg         | Instead of outputing tags for all tasks,      |
        |              | SENNA will output tags for the specified (one |
        |              | or more) tasks.                               |
        +--------------+-----------------------------------------------+

        :param sentence: a sentence, or a list of tokens that is joined with
            single spaces before being sent to SENNA.
        :type sentence: str or list
        :param options: SENNA switches, as one whitespace separated string or
            as a list.
        :type options: str or list
        :return: SENNA's tagged output, decoded and stripped.
        :rtype: str
        """
        if isinstance(options, str):
            options = options.strip().split()
        # Joining a plain string would insert a space between every
        # character, so only sequences of tokens are joined.
        if isinstance(sentence, str):
            text = sentence
        else:
            text = " ".join(sentence)
        return self._run_senna(text, list(options)).decode("utf-8").strip()

    def get_senna_tag(self, input_data):
        """Tag one sentence with SENNA's default options.

        :param input_data: a sentence (split on whitespace and re-joined with
            single spaces) or a list of tokens.
        :type input_data: str or list
        :return: SENNA's raw standard output.
        :rtype: bytes
        """
        if isinstance(input_data, str):
            input_data = input_data.split()
        return self._run_senna(" ".join(input_data))

    def get_senna_tag_batch(self, sentences):
        """Tag several sentences in a single SENNA run.

        The sentences are sent one per line.  SENNA's output is split on
        blank lines and the trailing remainder after the last blank line is
        dropped.

        :param list sentences: sentences to tag.
        :return: one block of tagged lines per sentence.
        :rtype: list
        """
        raw_output = self._run_senna("\n".join(sentences))
        return raw_output.decode("utf-8").split("\n\n")[0:-1]

    def get_dependency(self, parse):
        """Run the Stanford dependency extractor on a syntax tree.

        The tree is written to ``in.parse`` inside the SENNA directory and
        java is started with the Stanford parser directory as its working
        directory, so the relative ``stanford-parser.jar`` entry of the
        command line resolves there.  The ``in.parse`` token of the command
        line is replaced by the absolute path of the file just written.

        :param str parse: syntax tree(s) in bracketed format.
        :return: the extractor's standard output, decoded and stripped.
        :rtype: str
        """
        parse_file = os.path.abspath(os.path.join(self.senna_path, "in.parse"))
        with open(parse_file, "w", encoding="utf-8") as handle:
            handle.write(parse)
        command = [
            parse_file if token == "in.parse" else token
            for token in self.default_jar_cli
        ]
        pipe = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            cwd=os.path.dirname(self.dep_par_path) or None,
        )
        # communicate() drains both pipes; wait() followed by read() can
        # dead-lock once the child fills one of the pipe buffers.
        stanford_out = pipe.communicate()[0]
        return stanford_out.decode("utf-8").strip()

    def get_batch_annotations(self, sentences, dep_parse=True):
        """Annotate several sentences with one SENNA run.

        When ``dep_parse`` is true all syntax trees are sent to the Stanford
        extractor in one call; if the number of returned parses does not
        match the number of sentences, each sentence is parsed separately
        instead.

        :param list sentences: list of sentences.
        :param bool dep_parse: whether to add the dependency parse.
        :return: one annotation ``dict`` per sentence, see
            :meth:`get_annoations`.
        :rtype: list
        """
        annotations = [
            self._parse_senna_tags(senna_tags)
            for senna_tags in self.get_senna_tag_batch(sentences)
        ]
        if dep_parse and annotations:
            forest = "".join(item["syntax_tree"] for item in annotations)
            dependencies = self.get_dependency(forest).split("\n\n")
            if len(dependencies) == len(annotations):
                for dependency, item in zip(dependencies, annotations):
                    item["dep_parse"] = dependency
            else:
                for item in annotations:
                    item["dep_parse"] = self.get_dependency(
                        item["syntax_tree"]
                    )
        if self.save_all:
            for item in annotations:
                self._store(item)
        return annotations

    def get_annoations(self, sentence="", senna_tags=None, dep_parse=True):
        """Annotate one sentence and return the result as a ``dict``.

        :param sentence: sentence to send to SENNA; only used when
            ``senna_tags`` is empty.
        :type sentence: str or list
        :param str senna_tags: tab separated SENNA output for one sentence;
            when given, SENNA is not run.
        :param bool dep_parse: whether to run the Stanford extractor on the
            syntax tree.
        :return: ``dict`` with the keys ``words``, ``pos``, ``ner``, ``srl``,
            ``chunk``, ``verbs``, ``dep_parse`` and ``syntax_tree``.
        :rtype: dict
        """
        if not senna_tags:
            senna_tags = self.get_senna_tag(sentence).decode()
        annotations = self._parse_senna_tags(senna_tags)
        if dep_parse and annotations["syntax_tree"]:
            annotations["dep_parse"] = self.get_dependency(
                annotations["syntax_tree"]
            )
        if self.save_all:
            self._store(annotations)
        return annotations

    def _parse_senna_tags(self, senna_tags):
        """Turn SENNA's tab separated output for one sentence into a dict.

        Each non-blank line is read as ``word, pos, chunk, ner, verb``,
        followed by the SRL columns, followed by the syntax tree column.
        ``dep_parse`` is left empty.
        """
        stripped = (line.strip() for line in senna_tags.split("\n"))
        rows = [line.split("\t") for line in stripped if line]
        if not rows:
            return {
                "words": [],
                "pos": [],
                "ner": [],
                "srl": [],
                "chunk": [],
                "verbs": [],
                "dep_parse": "",
                "syntax_tree": "",
            }

        # Five leading columns plus the trailing syntax column are fixed;
        # every column in between is the SRL labelling for one verb.
        no_verbs = len(rows[0]) - 6
        words = [row[0].strip() for row in rows]
        pos = [row[1].strip() for row in rows]
        chunk = [row[2].strip() for row in rows]
        ner = [row[3].strip() for row in rows]
        verb = [row[4].strip() for row in rows]
        srls = [
            tuple(cell.strip() for cell in row[5:5 + no_verbs]) for row in rows
        ]
        syn = [row[-1] for row in rows]

        roles = []
        for verb_index in range(no_verbs):
            role = self._collect_role(
                words, [labels[verb_index] for labels in srls]
            )
            # Only role sets that actually contain a verb are reported.
            if "V" in role:
                roles.append(role)

        return {
            "words": words,
            "pos": list(zip(words, pos)),
            "ner": list(zip(words, ner)),
            "srl": roles,
            "chunk": list(zip(words, chunk)),
            "verbs": [item for item in verb if item != "-"],
            "dep_parse": "",
            "syntax_tree": "".join(
                syn_.replace("*", "(" + pos_ + " " + word_ + ")")
                for word_, syn_, pos_ in zip(words, syn, pos)
            ),
        }

    @staticmethod
    def _collect_role(words, labels):
        """Build the ``{role name: text}`` mapping for one SRL column.

        Labels have the form ``<prefix>-<name>``.  ``S`` marks a single word
        span, ``B``/``I`` collect words of a span and ``E`` closes it.  A
        repeated role name is extended with a space, except ``V`` which is
        overwritten.
        """
        role = {}
        pending = []
        for word, label in zip(words, labels):
            prefix, dash, name = label.partition("-")
            if prefix in ("B", "I"):
                pending.append(word)
                continue
            if prefix == "E":
                pending.append(word)
                text = " ".join(pending)
                pending = []
            elif prefix == "S":
                text = word
            else:
                continue
            if not dash:
                # A bare prefix carries no role name to store the text under.
                continue
            if name == "V" or name not in role:
                role[name] = text
            else:
                role[name] += " " + text
        return role

    def _store(self, annotations):
        """Instantiate the configured end point and store ``annotations``.

        The class is named by the ``ENDPOINT_CLASS`` environment variable as
        a dotted ``package.module.ClassName`` path.
        """
        from importlib import import_module

        target = os.getenv(
            "ENDPOINT_CLASS", default="snowbase.end_point.EntryPoint"
        )
        # import_module() only imports modules, so the class name has to be
        # split off and looked up on the imported module.
        module_name, _, class_name = target.rpartition(".")
        end_point = getattr(import_module(module_name), class_name)()
        self.__to_sql(annotations, end_point)

    def __to_sql(self, annotations, end_point):
        """Insert ``annotations`` through ``end_point`` and save it."""
        end_point.insert(annotations)
        self.save(end_point)

    def save(self, end_point):
        """Call ``end_point.save()``.

        :param end_point: object exposing a ``save()`` method.
        """
        end_point.save()