module_types
generate.LDA.for.tokens_array
Documentation
Perform Latent Dirichlet Allocation on a tokenized
corpus.
This module computes models for a range of topic
counts provided by the user.
Author(s)
Markus Binsteiner markus@frkl.io
Context
Tags language_processing, LDA, tokens
Labels package:
kiara_plugin.language_processing
References source_repo:
https://github.com/DHARPA-Project/kia…
documentation:
https://DHARPA-Project.github.io/kiar…
Module config schema
Field       Type     Description                        Required   Default
───────────────────────────────────────────────────────────────────────────
constants   object   Value constants for this module.   no
defaults    object   Value defaults for this module.    no
Python class
python_class_name LDAModule
python_module_name kiara_plugin.language_process…
full_name kiara_plugin.language_process…
Processing source code ─────────────────────────────────────────────────────
class LDAModule(KiaraModule):
"""Perform Latent Dirichlet Allocation on a to…
This module computes models for a range of num…
"""
_module_type_name = "generate.LDA.for.tokens_array"
KIARA_METADATA = {
"tags": ["LDA", "tokens"],
}
def create_inputs_schema(
self,
) -> ValueSetSchema:
inputs: Dict[str, Dict[str, Any]] = {
"tokens_array": {"type": "array", "doc…
"num_topics_min": {
"type": "integer",
"doc": "The minimal number of topi…
"default": 7,
},
"num_topics_max": {
"type": "integer",
"doc": "The max number of topics.",
"default": 7,
"optional": True,
},
"compute_coherence": {
"type": "boolean",
"doc": "Whether to compute the coh…
"default": False,
},
"words_per_topic": {
"type": "integer",
"doc": "How many words per topic t…
"default": 10,
},
}
return inputs
def create_outputs_schema(
self,
) -> ValueSetSchema:
outputs: Mapping[str, Mapping[str, Any]] =…
"topic_models": {
"type": "dict",
"doc": "A dictionary with one cohe…
},
"coherence_table": {
"type": "table",
"doc": "Coherence details.",
"optional": True,
},
"coherence_map": {
"type": "dict",
"doc": "A map with the coherence v…
},
}
return outputs
def create_model(self, corpus, num_topics: int…
from gensim.models import LdaModel
model = LdaModel(
corpus, id2word=id2word, num_topics=nu…
)
return model
def compute_coherence(self, model, corpus_mode…
from gensim.models import CoherenceModel
coherencemodel = CoherenceModel(
model=model,
texts=corpus_model,
dictionary=id2word,
coherence="c_v",
processes=1,
)
coherence_value = coherencemodel.get_coher…
return coherence_value
def assemble_coherence(self, models_dict: Mapp…
import pandas as pd
import pyarrow as pa
# Create list with topics and topic words …
num_topics_list = []
topics_list = []
for (
num_topics,
model,
) in models_dict.items():
num_topics_list.append(num_topics)
topic_print = model.print_topics(num_w…
topics_list.append(topic_print)
df_coherence_table = pd.DataFrame(columns=…
idx = 0
for i in range(len(topics_list)):
for j in range(len(topics_list[i])):
df_coherence_table.loc[idx] = ""
df_coherence_table["topic_id"].loc…
df_coherence_table["words"].loc[id…
re.findall(r'"(\w+)"', topics_…
)
df_coherence_table["num_topics"].l…
idx += 1
coherence_table = pa.Table.from_pandas(df_…
return coherence_table
def process(self, inputs: ValueMap, outputs: V…
from gensim import corpora
logging.getLogger("gensim").setLevel(loggi…
tokens_array: KiaraArray = inputs.get_valu…
tokens = tokens_array.arrow_array.to_pylis…
words_per_topic = inputs.get_value_data("w…
num_topics_min = inputs.get_value_data("nu…
num_topics_max = inputs.get_value_data("nu…
if not num_topics_max:
num_topics_max = num_topics_min
if num_topics_max < num_topics_min:
raise KiaraProcessingException(
"The max number of topics must be …
)
compute_coherence = inputs.get_value_data(…
id2word = corpora.Dictionary(tokens)
corpus = [id2word.doc2bow(text) for text i…
# model = gensim.models.ldamulticore.LdaMu…
# corpus, id2word=id2word, num_topics=…
# )
models = {}
model_tables = {}
coherence = {}
# multi_threaded = False
# if not multi_threaded:
for nt in range(num_topics_min, num_topics…
model = self.create_model(corpus=corpu…
models[nt] = model
topic_print_model = model.print_topics…
# dbg(topic_print_model)
# df = pd.DataFrame(topic_print_model,…
# TODO: create table directly
# result_table = Table.from_pandas(df)
model_tables[nt] = topic_print_model
if compute_coherence:
coherence_result = self.compute_co…
model=model, corpus_model=toke…
)
coherence[nt] = coherence_result
# else:
# def create_model(num_topics):
# model = self.create_model(corpus…
# topic_print_model = model.print_…
# df = pd.DataFrame(topic_print_mo…
# # TODO: create table directly
# result_table = Table.from_pandas…
# coherence_result = None
# if compute_coherence:
# coherence_result = self.comp…
# return (num_topics, model, resul…
#
# executor = ThreadPoolExecutor()
# results: typing.Any = executor.map(c…
# executor.shutdown(wait=True)
# for r in results:
# models[r[0]] = r[1]
# model_tables[r[0]] = r[2]
# if compute_coherence:
# coherence[r[0]] = r[3]
# df_coherence = pd.DataFrame(coherence.ke…
# df_coherence["Coherence"] = coherence.va…
if compute_coherence:
coherence_table = self.assemble_cohere…
models_dict=models, words_per_topi…
)
else:
coherence_table = None
coherence_map = {k: v.item() for k, v in c…
outputs.set_values(
topic_models=model_tables,
coherence_table=coherence_table,
coherence_map=coherence_map,
)
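
Example (illustrative): the processing above is essentially a thin wrapper around gensim. Below is a minimal, self-contained sketch of the same workflow outside of kiara; the sample tokens and the fixed topic range are assumptions for illustration, not part of the module's API.

# Minimal sketch of the gensim workflow wrapped by generate.LDA.for.tokens_array.
# The `tokens` data and topic range below are illustrative assumptions.
from gensim import corpora
from gensim.models import CoherenceModel, LdaModel

tokens = [
    ["topic", "modelling", "finds", "latent", "themes", "in", "documents"],
    ["latent", "dirichlet", "allocation", "models", "topics"],
    ["tokens", "are", "the", "input", "for", "topic", "models"],
]

# Build the gensim dictionary and bag-of-words corpus, as in process().
id2word = corpora.Dictionary(tokens)
corpus = [id2word.doc2bow(text) for text in tokens]

models = {}
coherence = {}
# Analogous to iterating num_topics_min..num_topics_max.
for num_topics in range(2, 4):
    model = LdaModel(corpus, id2word=id2word, num_topics=num_topics)
    models[num_topics] = model
    # Optional coherence score, mirroring compute_coherence=True.
    cm = CoherenceModel(
        model=model, texts=tokens, dictionary=id2word, coherence="c_v", processes=1
    )
    coherence[num_topics] = cm.get_coherence()

for num_topics, model in models.items():
    print(num_topics, model.print_topics(num_words=10))
print(coherence)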
─────────────────────────────────────────────────────
tokenize.string
Documentation
Tokenize a string.
Author(s)
Markus Binsteiner markus@frkl.io
Context
Tags language_processing
Labels package:
kiara_plugin.language_processing
References source_repo:
https://github.com/DHARPA-Project/kia…
documentation:
https://DHARPA-Project.github.io/kiar…
Module config schema
Field              Type      Description                               Required   Default
──────────────────────────────────────────────────────────────────────────────────────────
constants          object    Value constants for this module.          no
defaults           object    Value defaults for this module.           no
filter_non_alpha   boolean   Whether to filter out non-alpha tokens.   no         true
min_token_length   integer   The minimum token length.                 no         3
to_lowercase       boolean   Whether to lowercase the tokens.          no         true
Python class
python_class_name TokenizeTextModule
python_module_name kiara_plugin.language_process…
full_name kiara_plugin.language_process…
Processing source code ─────────────────────────────────────────────────────
class TokenizeTextModule(KiaraModule):
"""Tokenize a string."""
_config_cls = TokenizeTextConfig
_module_type_name = "tokenize.string"
def create_inputs_schema(
self,
) -> ValueSetSchema:
inputs = {"text": {"type": "string", "doc"…
return inputs
def create_outputs_schema(
self,
) -> ValueSetSchema:
outputs = {
"token_list": {
"type": "list",
"doc": "The tokenized version of t…
}
}
return outputs
def process(self, inputs: ValueMap, outputs: V…
import nltk
get_stopwords()
# TODO: module-independent caching?
# language = inputs.get_value_data("langua…
#
text = inputs.get_value_data("text")
tokenized = nltk.word_tokenize(text)
result = tokenized
if self.get_config_value("min_token_length…
result = (
x
for x in tokenized
if len(x) >= self.get_config_value…
)
if self.get_config_value("filter_non_alpha…
result = (x for x in result if x.isalp…
if self.get_config_value("to_lowercase"):
result = (x.lower() for x in result)
outputs.set_value("token_list", list(resul…
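
Example (illustrative): a minimal sketch of the same tokenize-and-filter steps using NLTK directly, outside of kiara. The sample text is an assumption for illustration; the filter values match the module's config defaults, and NLTK's punkt tokenizer data must already be available.

# Minimal sketch of the tokenize.string logic with default config values.
import nltk

text = "The quick brown fox (no. 7) jumps over the lazy dog!"
min_token_length = 3      # config default
filter_non_alpha = True   # config default
to_lowercase = True       # config default

tokens = nltk.word_tokenize(text)  # needs the NLTK 'punkt' tokenizer data
result = (t for t in tokens if len(t) >= min_token_length)
if filter_non_alpha:
    result = (t for t in result if t.isalpha())
if to_lowercase:
    result = (t.lower() for t in result)

print(list(result))
# ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']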
─────────────────────────────────────────────────────
tokenize.texts_array
Documentation
Split sentences into words or words into
characters.
In other words, this operation establishes the word
boundaries (i.e., tokens), a very helpful way of
finding patterns. It is also the typical step prior
to stemming and lemmatization.
Author(s)
Markus Binsteiner markus@frkl.io
Context
Tags language_processing, tokenize, tokens
Labels package:
kiara_plugin.language_processing
References source_repo:
https://github.com/DHARPA-Project/kia…
documentation:
https://DHARPA-Project.github.io/kiar…
Module config schema
Field       Type     Description                        Required   Default
───────────────────────────────────────────────────────────────────────────
constants   object   Value constants for this module.   no
defaults    object   Value defaults for this module.    no
Python class
python_class_name TokenizeTextArrayeModule
python_module_name kiara_plugin.language_process…
full_name kiara_plugin.language_process…
Processing source code ─────────────────────────────────────────────────────
class TokenizeTextArrayeModule(KiaraModule):
"""Split sentences into words or words into ch…
In other words, this operation establishes the…
"""
_module_type_name = "tokenize.texts_array"
KIARA_METADATA = {
"tags": ["tokenize", "tokens"],
}
def create_inputs_schema(
self,
) -> ValueSetSchema:
return {
"texts_array": {
"type": "array",
"doc": "An array of text items to …
},
"tokenize_by_word": {
"type": "boolean",
"doc": "Whether to tokenize by wor…
"default": True,
},
}
def create_outputs_schema(
self,
) -> ValueSetSchema:
return {
"tokens_array": {
"type": "array",
"doc": "The tokenized content, as …
}
}
def process(self, inputs: ValueMap, outputs: V…
import nltk
import polars as pl
import pyarrow as pa
get_stopwords()
array: KiaraArray = inputs.get_value_data(…
# for text in array.arrow_array:
# print("----")
# print(len(str(text)))
tokenize_by_word: bool = inputs.get_value_…
if not tokenize_by_word:
raise KiaraProcessingException(
"Non-word tokenization is not yet …
)
column: pa.ChunkedArray = array.arrow_arra…
# warnings.filterwarnings("ignore", catego…
def word_tokenize(word):
return nltk.word_tokenize(word)
series = pl.Series(name="tokens", values=c…
result = series.apply(word_tokenize)
result_array = result.to_arrow()
# TODO: remove this cast once the array da…
chunked = pa.chunked_array(result_array)
outputs.set_values(tokens_array=chunked)
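
Example (illustrative): a minimal sketch of per-element tokenization over a pyarrow array using plain Python, outside of kiara; the module performs the same step via a polars Series apply. The sample texts are assumptions for illustration, and NLTK's punkt tokenizer data must be available.

# Minimal sketch of tokenizing every element of a pyarrow string array by word.
import nltk
import pyarrow as pa

texts = pa.array(["One short text.", "Another text to tokenize."])

# Tokenize each element; the module does this with a polars Series apply call.
tokens = [nltk.word_tokenize(text.as_py()) for text in texts]

# Wrap the result as a (chunked) pyarrow list-of-strings array, as the module does.
tokens_array = pa.chunked_array([pa.array(tokens)])
print(tokens_array)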
─────────────────────────────────────────────────────
create.stopwords_list
Documentation
Create a list of stopwords from one or multiple
sources.
This will download nltk stopwords if necessary, and
merge all input lists into a single, sorted list
without duplicates.
Author(s)
Markus Binsteiner markus@frkl.io
Context
Tags language_processing
Labels package:
kiara_plugin.language_processing
References source_repo:
https://github.com/DHARPA-Project/kia…
documentation:
https://DHARPA-Project.github.io/kiar…
Module config schema
Field       Type     Description                        Required   Default
───────────────────────────────────────────────────────────────────────────
constants   object   Value constants for this module.   no
defaults    object   Value defaults for this module.    no
Python class
python_class_name AssembleStopwordsModule
python_module_name kiara_plugin.language_process…
full_name kiara_plugin.language_process…
Processing source code ─────────────────────────────────────────────────────
class AssembleStopwordsModule(KiaraModule):
"""Create a list of stopwords from one or mult…
This will download nltk stopwords if necessary…
"""
_module_type_name = "create.stopwords_list"
def create_inputs_schema(
self,
) -> ValueSetSchema:
return {
"languages": {
"type": "list",
"doc": "A list of languages, will …
"optional": True,
},
"stopwords": {
"type": "list",
"doc": "A list of additional, cust…
"optional": True,
},
}
def create_outputs_schema(
self,
) -> ValueSetSchema:
return {
"stopwords_list": {
"type": "list",
"doc": "A sorted list of unique st…
}
}
def process(self, inputs: ValueMap, outputs: V…
stopwords = set()
_languages = inputs.get_value_obj("languag…
if _languages.is_set:
all_stopwords = get_stopwords()
languages: KiaraList = _languages.data
for language in languages.list_data:
if language not in all_stopwords.f…
raise KiaraProcessingException(
f"Invalid language: {langu…
)
stopwords.update(all_stopwords.wor…
_stopword_lists = inputs.get_value_obj("st…
if _stopword_lists.is_set:
stopword_lists: KiaraList = _stopword_…
for stopword_list in stopword_lists.li…
if isinstance(stopword_list, str):
stopwords.add(stopword_list)
else:
stopwords.update(stopword_list)
outputs.set_value("stopwords_list", sorted…
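
Example (illustrative): a minimal sketch of the same merge logic using NLTK's stopword corpus directly, outside of kiara. The languages and custom stopwords are assumptions for illustration.

# Minimal sketch: merge NLTK stopword lists with custom stopwords.
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords", quiet=True)  # no-op if the corpus is already present

merged = set()
for language in ["english", "german"]:
    if language not in stopwords.fileids():
        raise ValueError(f"Invalid language: {language}")
    merged.update(stopwords.words(language))

merged.update(["foo", "bar"])  # additional, custom stopwords

stopwords_list = sorted(merged)
print(len(stopwords_list), stopwords_list[:5])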
─────────────────────────────────────────────────────
preprocess.tokens_array
Documentation
Preprocess lists of tokens, incl. lowercasing,
removing special characters, etc.
Lowercasing: Lowercase the words. This operation is
a double-edged sword. It can be effective at
yielding potentially better results in the case of
relatively small datasets or datasets with a high
percentage of OCR mistakes. For instance, if
lowercasing is not performed, the algorithm will
treat USA, Usa, usa, UsA, uSA, etc. as distinct
tokens, even though they may all refer to the same
entity. On the other hand, if the dataset does not
contain such OCR mistakes, then it may become
difficult to distinguish between homonyms and make
interpreting the topics much harder.

Removing stopwords and words with fewer than three
characters: Remove low-information words. These are
typically words such as articles, pronouns,
prepositions, conjunctions, etc., which are not
semantically salient. There are numerous stopword
lists available for many, though not all, languages,
which can be easily adapted to the individual
researcher's needs. Removing words with fewer than
three characters may additionally remove many OCR
mistakes. Both of these operations have the dual
advantage of yielding more reliable results while
reducing the size of the dataset, which in turn
reduces the required processing power. This step
can therefore hardly be considered optional in
topic modelling.

Noise removal: Remove elements such as punctuation
marks, special characters, numbers, HTML formatting,
etc. This operation is again concerned with removing
elements that may not be relevant to the text
analysis and in fact interfere with it. Depending on
the dataset and research question, this operation
can become essential.
Author(s)
Markus Binsteiner markus@frkl.io
Context
Tags language_processing, tokens,
preprocess
Labels package:
kiara_plugin.language_processing
References source_repo:
https://github.com/DHARPA-Project/kia…
documentation:
https://DHARPA-Project.github.io/kiar…
Module config schema
Field       Type     Description                        Required   Default
───────────────────────────────────────────────────────────────────────────
constants   object   Value constants for this module.   no
defaults    object   Value defaults for this module.    no
Python class
python_class_name PreprocessModule
python_module_name kiara_plugin.language_process…
full_name kiara_plugin.language_process…
Processing source code ─────────────────────────────────────────────────────
class PreprocessModule(KiaraModule):
"""Preprocess lists of tokens, incl. lowercasi…
Lowercasing: Lowercase the words. This operati…
Removing stopwords and words with less than th…
Noise removal: Remove elements such as punctua…
"""
_module_type_name = "preprocess.tokens_array"
KIARA_METADATA = {
"tags": ["tokens", "preprocess"],
}
def create_inputs_schema(
self,
) -> ValueSetSchema:
return {
"tokens_array": {
"type": "array",
"doc": "The tokens array to pre-pr…
},
"to_lowercase": {
"type": "boolean",
"doc": "Apply lowercasing to the t…
"default": False,
},
"remove_alphanumeric": {
"type": "boolean",
"doc": "Remove all tokens that inc…
"default": False,
},
"remove_non_alpha": {
"type": "boolean",
"doc": "Remove all tokens that inc…
"default": False,
},
"remove_all_numeric": {
"type": "boolean",
"doc": "Remove all tokens that con…
"default": False,
},
"remove_short_tokens": {
"type": "integer",
"doc": "Remove tokens shorter or e…
"default": 0,
},
"remove_stopwords": {
"type": "list",
"doc": "Remove stopwords.",
"optional": True,
},
}
def create_outputs_schema(
self,
) -> ValueSetSchema:
return {
"tokens_array": {
"type": "array",
"doc": "The pre-processed content,…
}
}
def process(self, inputs: ValueMap, outputs: V…
import polars as pl
import pyarrow as pa
tokens_array: KiaraArray = inputs.get_valu…
lowercase: bool = inputs.get_value_data("t…
remove_alphanumeric: bool = inputs.get_val…
remove_non_alpha: bool = inputs.get_value_…
remove_all_numeric: bool = inputs.get_valu…
remove_short_tokens: int = inputs.get_valu…
if remove_short_tokens is None:
remove_short_tokens = -1
_remove_stopwords = inputs.get_value_obj("…
if _remove_stopwords.is_set:
stopword_list: Union[Iterable[str], No…
else:
stopword_list = None
# it's better to have one method every tok…
# because that way each token only needs t…
def check_token(token: str) -> Union[str, …
# remove short tokens first, since we …
assert isinstance(remove_short_tokens,…
if remove_short_tokens > 0:
if len(token) <= remove_short_toke…
return None
_token: str = token
if lowercase:
_token = _token.lower()
if remove_non_alpha:
match = _token if _token.isalpha()…
if match is None:
return None
# if remove_non_alpha was set, we don'…
if remove_alphanumeric and not remove_…
match = _token if _token.isalnum()…
if match is None:
return None
# all-number tokens are already filter…
if remove_all_numeric and not remove_n…
match = None if _token.isdigit() e…
if match is None:
return None
if stopword_list and _token and _token…
return None
return _token
series = pl.Series(name="tokens", values=t…
result = series.apply(
lambda token_list: [
x for x in (check_token(token) for…
]
)
result_array = result.to_arrow()
# TODO: remove this cast once the array da…
chunked = pa.chunked_array(result_array)
outputs.set_values(tokens_array=chunked)
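
Example (illustrative): a minimal sketch of the per-token filtering described above, applied to a plain list of token lists; the module applies the same rules per row of the pyarrow array via polars. The config values, helper names and sample tokens are assumptions for illustration.

# Minimal sketch of the preprocess.tokens_array filtering rules.
from typing import List, Optional

to_lowercase = True
remove_non_alpha = True
remove_short_tokens = 2          # drop tokens of length <= 2
stopword_list = {"the", "and"}   # e.g. the output of create.stopwords_list

def check_token(token: str) -> Optional[str]:
    # Check the cheap length rule first, then normalize, then filter.
    if remove_short_tokens > 0 and len(token) <= remove_short_tokens:
        return None
    if to_lowercase:
        token = token.lower()
    if remove_non_alpha and not token.isalpha():
        return None
    if stopword_list and token in stopword_list:
        return None
    return token

def preprocess(token_lists: List[List[str]]) -> List[List[str]]:
    return [
        [t for t in (check_token(tok) for tok in row) if t is not None]
        for row in token_lists
    ]

print(preprocess([["The", "Quick", "Brown", "Fox", "42", "and", "a", "dog!"]]))
# [['quick', 'brown', 'fox']]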
─────────────────────────────────────────────────────