module_types
generate.LDA.for.tokens_array
Documentation
Perform Latent Dirichlet Allocation on a tokenized
corpus.
This module computes models for a range of topic
counts provided by the user.
Author(s)
Markus Binsteiner markus@frkl.io
Context
Tags language_processing, LDA, tokens
Labels package:
kiara_plugin.language_processing
References source_repo:
https://github.com/DHARPA-Project/kia…
documentation:
https://DHARPA-Project.github.io/kiar…
Module config schema
Field       Type     Description                        Required   Default
───────────────────────────────────────────────────────────────────────────
constants   object   Value constants for this module.   no
defaults    object   Value defaults for this module.    no
Python class
python_class_name LDAModule
python_module_name kiara_plugin.language_process…
full_name kiara_plugin.language_process…
Processing source code ─────────────────────────────────────────────────────
def process(self, inputs: ValueMap, outputs: ValueMap):

    from gensim import corpora

    logging.getLogger("gensim").setLevel(logging.ERROR)

    tokens_array: KiaraArray = inputs.get_value_data("tokens_array")
    tokens = tokens_array.arrow_array.to_pylist()

    words_per_topic = inputs.get_value_data("words_per_topic")

    num_topics_min = inputs.get_value_data("num_topics_min")
    num_topics_max = inputs.get_value_data("num_topics_max")
    if not num_topics_max:
        num_topics_max = num_topics_min

    if num_topics_max < num_topics_min:
        raise KiaraProcessingException(
            "The max number of topics must be larger or equal to the min number of topics."
        )

    compute_coherence = inputs.get_value_data("compute_coherence")

    id2word = corpora.Dictionary(tokens)
    corpus = [id2word.doc2bow(text) for text in tokens]

    # model = gensim.models.ldamulticore.LdaMulticore(
    #     corpus, id2word=id2word, num_topics=num_…
    # )

    models = {}
    model_tables = {}
    coherence = {}

    # multi_threaded = False
    # if not multi_threaded:
    for nt in range(num_topics_min, num_topics_max + 1):
        model = self.create_model(corpus=corpus, num_topics=nt, id2word=id2word)
        models[nt] = model
        topic_print_model = model.print_topics(num_words=words_per_topic)
        # dbg(topic_print_model)
        # df = pd.DataFrame(topic_print_model, col…
        # TODO: create table directly
        # result_table = Table.from_pandas(df)
        model_tables[nt] = topic_print_model

        if compute_coherence:
            coherence_result = self.compute_coherence(
                model=model, corpus_model=tokens, …
            )
            coherence[nt] = coherence_result

    # else:
    #     def create_model(num_topics):
    #         model = self.create_model(corpus=cor…
    #         topic_print_model = model.print_topi…
    #         df = pd.DataFrame(topic_print_model,…
    #         # TODO: create table directly
    #         result_table = Table.from_pandas(df)
    #         coherence_result = None
    #         if compute_coherence:
    #             coherence_result = self.compute_…
    #         return (num_topics, model, result_ta…
    #
    #     executor = ThreadPoolExecutor()
    #     results: typing.Any = executor.map(creat…
    #     executor.shutdown(wait=True)
    #     for r in results:
    #         models[r[0]] = r[1]
    #         model_tables[r[0]] = r[2]
    #         if compute_coherence:
    #             coherence[r[0]] = r[3]

    # df_coherence = pd.DataFrame(coherence.keys()…
    # df_coherence["Coherence"] = coherence.values…

    if compute_coherence:
        coherence_table = self.assemble_coherence(
            models_dict=models, words_per_topic=words_per_topic
        )
    else:
        coherence_table = None

    coherence_map = {k: v.item() for k, v in coherence.items()}

    outputs.set_values(
        topic_models=model_tables,
        coherence_table=coherence_table,
        coherence_map=coherence_map,
    )
─────────────────────────────────────────────────────
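Usage sketch ──────────────────────────────────────────
For orientation, a minimal standalone sketch of what this module does,
using gensim directly. This is not the module's code: the toy corpus,
the variable names and the "u_mass" coherence measure are illustrative
choices (the module's own coherence helper may use a different measure).

# Standalone sketch (assumes gensim is installed): train LDA models for a
# range of topic counts and score each one with a coherence model.
from gensim import corpora
from gensim.models import CoherenceModel, LdaModel

# stand-in for the tokens_array input: a list of token lists
documents = [
    ["topic", "modelling", "finds", "latent", "themes", "in", "texts"],
    ["stopwords", "are", "removed", "before", "topic", "modelling"],
    ["coherence", "scores", "help", "pick", "the", "number", "of", "topics"],
]

id2word = corpora.Dictionary(documents)
corpus = [id2word.doc2bow(text) for text in documents]

models, coherence = {}, {}
for num_topics in range(2, 4):  # e.g. num_topics_min=2, num_topics_max=3
    model = LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)
    models[num_topics] = model
    cm = CoherenceModel(
        model=model, corpus=corpus, dictionary=id2word, coherence="u_mass"
    )
    coherence[num_topics] = cm.get_coherence()

best = max(coherence, key=coherence.get)
print(f"coherence per topic count: {coherence}")
print(f"best number of topics by u_mass coherence: {best}")
─────────────────────────────────────────────────────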
tokenize.string
Documentation
Tokenize a string.
Author(s)
Markus Binsteiner markus@frkl.io
Context
Tags language_processing
Labels package:
kiara_plugin.language_processing
References source_repo:
https://github.com/DHARPA-Project/kia…
documentation:
https://DHARPA-Project.github.io/kiar…
Module config schema
Field              Type      Description                               Required   Default
──────────────────────────────────────────────────────────────────────────────────────────
constants          object    Value constants for this module.          no
defaults           object    Value defaults for this module.           no
filter_non_alpha   boolean   Whether to filter out non-alpha tokens.   no         true
min_token_length   integer   The minimum token length.                 no         3
to_lowercase       boolean   Whether to lowercase the tokens.          no         true
Python class
python_class_name TokenizeTextModule
python_module_name kiara_plugin.language_process…
full_name kiara_plugin.language_process…
Processing source code ─────────────────────────────────────────────────────
def process(self, inputs: ValueMap, outputs: ValueMap):

    import nltk

    # TODO: module-independent caching?
    # language = inputs.get_value_data("language")
    #
    text = inputs.get_value_data("text")
    tokenized = nltk.word_tokenize(text)

    result = tokenized
    if self.get_config_value("min_token_length") > 0:
        result = (
            x
            for x in tokenized
            if len(x) >= self.get_config_value("min_token_length")
        )

    if self.get_config_value("filter_non_alpha"):
        result = (x for x in result if x.isalpha())

    if self.get_config_value("to_lowercase"):
        result = (x.lower() for x in result)

    outputs.set_value("token_list", list(result))
─────────────────────────────────────────────────────
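Usage sketch ──────────────────────────────────────────
The same steps outside of kiara look roughly like this (a sketch,
assuming nltk is installed and its tokenizer data can be downloaded;
the config values are hard-coded here for illustration).

import nltk

# word_tokenize needs the punkt data; newer NLTK versions use "punkt_tab"
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

text = "The quick brown fox jumps over 2 lazy dogs!"
min_token_length = 3  # module config default

tokens = nltk.word_tokenize(text)
tokens = [t for t in tokens if len(t) >= min_token_length]  # min_token_length
tokens = [t for t in tokens if t.isalpha()]                 # filter_non_alpha
tokens = [t.lower() for t in tokens]                        # to_lowercase

print(tokens)
# ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dogs']
─────────────────────────────────────────────────────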
tokenize.texts_array
Documentation
Split sentences into words or words into
characters.
In other words, this operation establishes the word
boundaries (i.e., tokens), a very helpful way of
finding patterns. It is also the typical step prior
to stemming and lemmatization.
Author(s)
Markus Binsteiner markus@frkl.io
Context
Tags language_processing, tokenize, tokens
Labels package:
kiara_plugin.language_processing
References source_repo:
https://github.com/DHARPA-Project/kia…
documentation:
https://DHARPA-Project.github.io/kiar…
Module config schema
Field       Type     Description                        Required   Default
───────────────────────────────────────────────────────────────────────────
constants   object   Value constants for this module.   no
defaults    object   Value defaults for this module.    no
Python class
python_class_name TokenizeTextArrayeModule
python_module_name kiara_plugin.language_process…
full_name kiara_plugin.language_process…
Processing source code ─────────────────────────────────────────────────────
def process(self, inputs: ValueMap, outputs: ValueMap):

    import nltk
    import polars as pl
    import pyarrow as pa

    array: KiaraArray = inputs.get_value_data("texts_array")
    # tokenize_by_word: bool = inputs.get_value_da…

    column: pa.ChunkedArray = array.arrow_array

    # warnings.filterwarnings("ignore", category=n…

    def word_tokenize(word):
        result = nltk.word_tokenize(word)
        return result

    series = pl.Series(name="tokens", values=column.to_pylist())
    result = series.apply(word_tokenize)
    result_array = result.to_arrow()

    # TODO: remove this cast once the array data t…
    chunked = pa.chunked_array(result_array)
    outputs.set_values(tokens_array=chunked)
─────────────────────────────────────────────────────
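Usage sketch ──────────────────────────────────────────
A standalone sketch of the same pattern: map the NLTK tokenizer over a
pyarrow string column via polars. Assumes nltk, polars and pyarrow are
installed; the column name is illustrative, and note that newer polars
versions call Series.apply Series.map_elements.

import nltk
import polars as pl
import pyarrow as pa

nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

# stand-in for the texts_array input
texts = pa.chunked_array([pa.array(["One sentence here.", "And another one."])])

series = pl.Series(name="texts", values=texts.to_pylist())
# older polars: series.apply(nltk.word_tokenize)
tokens = series.map_elements(nltk.word_tokenize, return_dtype=pl.List(pl.Utf8))

tokens_array = pa.chunked_array([tokens.to_arrow()])
print(tokens_array.to_pylist())
# [['One', 'sentence', 'here', '.'], ['And', 'another', 'one', '.']]
─────────────────────────────────────────────────────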
create.stopwords_list
Documentation
Create a list of stopwords from one or multiple
sources.
This will download nltk stopwords if necessary, and
merge all input lists into a single, sorted list
without duplicates.
Author(s)
Markus Binsteiner markus@frkl.io
Context
Tags language_processing
Labels package:
kiara_plugin.language_processing
References source_repo:
https://github.com/DHARPA-Project/kia…
documentation:
https://DHARPA-Project.github.io/kiar…
Module config schema
Field       Type     Description                        Required   Default
───────────────────────────────────────────────────────────────────────────
constants   object   Value constants for this module.   no
defaults    object   Value defaults for this module.    no
Python class
python_class_name AssembleStopwordsModule
python_module_name kiara_plugin.language_process…
full_name kiara_plugin.language_process…
Processing source code ─────────────────────────────────────────────────────
def process(self, inputs: ValueMap, outputs: ValueMap):

    stopwords = set()
    _languages = inputs.get_value_obj("languages")

    if _languages.is_set:
        all_stopwords = get_stopwords()
        languages: ListModel = _languages.data

        for language in languages.list_data:
            if language not in all_stopwords.fileids():
                raise KiaraProcessingException(
                    f"Invalid language: {language}…
                )
            stopwords.update(get_stopwords().words(language))

    _stopword_lists = inputs.get_value_obj("stopword_lists")
    if _stopword_lists.is_set:
        stopword_lists: ListModel = _stopword_lists.data
        for stopword_list in stopword_lists.list_data:
            if isinstance(stopword_list, str):
                stopwords.add(stopword_list)
            else:
                stopwords.update(stopword_list)

    outputs.set_value("stopwords_list", sorted(stopwords))
─────────────────────────────────────────────────────
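Usage sketch ──────────────────────────────────────────
Outside of kiara, the same assembly looks roughly like this (a sketch,
assuming nltk is installed; the example languages and custom lists are
illustrative).

import nltk
from nltk.corpus import stopwords as nltk_stopwords

nltk.download("stopwords", quiet=True)

languages = ["english", "german"]
custom_lists = [["foo", "bar"], "baz"]  # lists and single strings, as the module accepts

merged = set()
for language in languages:
    if language not in nltk_stopwords.fileids():
        raise ValueError(f"Invalid language: {language}")
    merged.update(nltk_stopwords.words(language))

for item in custom_lists:
    if isinstance(item, str):
        merged.add(item)
    else:
        merged.update(item)

stopwords_list = sorted(merged)
print(len(stopwords_list), stopwords_list[:5])
─────────────────────────────────────────────────────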
remove_stopwords.from.tokens_array
Documentation
Remove stopwords from an array of token-lists.
Author(s)
Markus Binsteiner markus@frkl.io
Context
Tags language_processing
Labels package:
kiara_plugin.language_processing
References source_repo:
https://github.com/DHARPA-Project/kia…
documentation:
https://DHARPA-Project.github.io/kiar…
Module config schema
Field       Type     Description                        Required   Default
───────────────────────────────────────────────────────────────────────────
constants   object   Value constants for this module.   no
defaults    object   Value defaults for this module.    no
Python class
python_class_name RemoveStopwordsModule
python_module_name kiara_plugin.language_process…
full_name kiara_plugin.language_process…
Processing source code ─────────────────────────────────────────────────────
def process(self, inputs: ValueMap, outputs: ValueMap):

    import pyarrow as pa

    custom_stopwords = inputs.get_value_data("additional_stopwords")

    if inputs.get_value_obj("languages").is_set:
        _languages: ListModel = inputs.get_value_data("languages")
        languages = _languages.list_data
    else:
        languages = []

    stopwords = set()
    if languages:
        for language in languages:
            if language not in get_stopwords().fileids():
                raise KiaraProcessingException(
                    f"Invalid language: {language}…
                )
            stopwords.update(get_stopwords().words(language))

    if custom_stopwords:
        stopwords.update(custom_stopwords)

    orig_array = inputs.get_value_obj("tokens_array")
    if not stopwords:
        outputs.set_value("tokens_array", orig_array)
        return

    # if hasattr(orig_array, "to_pylist"):
    #     token_lists = orig_array.to_pylist()

    tokens_array = orig_array.data.arrow_array

    # TODO: use vaex for this
    result = []
    for token_list in tokens_array:
        cleaned_list = [x for x in token_list.as_py() if x not in stopwords]
        result.append(cleaned_list)

    outputs.set_value("tokens_array", pa.chunked_array([result]))
─────────────────────────────────────────────────────
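Usage sketch ──────────────────────────────────────────
The core loop, as a standalone sketch with pyarrow only (the stopword
set and token lists below are illustrative, not the module's inputs).

import pyarrow as pa

stopwords = {"the", "a", "of"}

# stand-in for the tokens_array input
tokens_array = pa.chunked_array(
    [pa.array([["the", "quick", "fox"], ["a", "list", "of", "tokens"]])]
)

result = []
for token_list in tokens_array:
    cleaned = [t for t in token_list.as_py() if t not in stopwords]
    result.append(cleaned)

cleaned_array = pa.chunked_array([pa.array(result)])
print(cleaned_array.to_pylist())
# [['quick', 'fox'], ['list', 'tokens']]
─────────────────────────────────────────────────────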
preprocess.tokens_array
Documentation
Preprocess lists of tokens, incl. lowercasing,
removing special characters, etc.
Lowercasing: Lowercase the words. This operation is
a double-edged sword. It can be effective at
yielding potentially better results in the case of
relatively small datasets or datasets with a high
percentage of OCR mistakes. For instance, if
lowercasing is not performed, the algorithm will
treat USA, Usa, usa, UsA, uSA, etc. as distinct
tokens, even though they may all refer to the same
entity. On the other hand, if the dataset does not
contain such OCR mistakes, then it may become
difficult to distinguish between homonyms and make
interpreting the topics much harder.
Removing stopwords and words with fewer than three
characters: Remove low-information words. These are
typically words such as articles, pronouns,
prepositions, conjunctions, etc. which are not
semantically salient. There are numerous stopword
lists available for many, though not all, languages
which can be easily adapted to the individual
researcher's needs. Removing words with fewer than
three characters may additionally remove many OCR
mistakes. Both these operations have the dual
advantage of yielding more reliable results while
reducing the size of the dataset, thus in turn
reducing the required processing power. This step
can therefore hardly be considered optional in topic
modelling (TM).
Noise removal: Remove elements such as punctuation
marks, special characters, numbers, html
formatting, etc. This operation is again concerned
with removing elements that may not be relevant to
the text analysis and may in fact interfere with it.
Depending on the dataset and research question,
this operation can become essential.
Author(s)
Markus Binsteiner markus@frkl.io
Context
Tags language_processing, tokens,
preprocess
Labels package:
kiara_plugin.language_processing
References source_repo:
https://github.com/DHARPA-Project/kia…
documentation:
https://DHARPA-Project.github.io/kiar…
Module config schema
Field       Type     Description                        Required   Default
───────────────────────────────────────────────────────────────────────────
constants   object   Value constants for this module.   no
defaults    object   Value defaults for this module.    no
Python class
python_class_name PreprocessModule
python_module_name kiara_plugin.language_process…
full_name kiara_plugin.language_process…
Processing source code ─────────────────────────────────────────────────────
def process(self, inputs: ValueMap, outputs: ValueMap):

    import polars as pl
    import pyarrow as pa

    tokens_array: KiaraArray = inputs.get_value_data("tokens_array")

    lowercase: bool = inputs.get_value_data("to_lowercase")
    remove_alphanumeric: bool = inputs.get_value_data("remove_alphanumeric")
    remove_non_alpha: bool = inputs.get_value_data("remove_non_alpha")
    remove_all_numeric: bool = inputs.get_value_data("remove_all_numeric")
    remove_short_tokens: int = inputs.get_value_data("remove_short_tokens")
    if remove_short_tokens is None:
        remove_short_tokens = -1

    _remove_stopwords = inputs.get_value_obj("remove_stopwords")
    if _remove_stopwords.is_set:
        stopword_list: Union[Iterable[str], None] …
    else:
        stopword_list = None

    # it's better to have one method every token goes through,
    # because that way each token only needs to be processed once
    def check_token(token: str) -> Union[str, None]:

        # remove short tokens first, since we can …
        assert isinstance(remove_short_tokens, int)
        if remove_short_tokens > 0:
            if len(token) <= remove_short_tokens:
                return None

        _token: str = token
        if lowercase:
            _token = _token.lower()

        if remove_non_alpha:
            match = _token if _token.isalpha() else None
            if match is None:
                return None

        # if remove_non_alpha was set, we don't need to check this
        if remove_alphanumeric and not remove_non_alpha:
            match = _token if _token.isalnum() else None
            if match is None:
                return None

        # all-number tokens are already filtered out if remove_non_alpha was set
        if remove_all_numeric and not remove_non_alpha:
            match = None if _token.isdigit() else _token
            if match is None:
                return None

        if stopword_list and _token and _token.lower() in stopword_list:
            return None

        return _token

    series = pl.Series(name="tokens", values=token…
    result = series.apply(
        lambda token_list: [
            x for x in (check_token(token) for token in token_list) if x is not None
        ]
    )

    result_array = result.to_arrow()

    # TODO: remove this cast once the array data t…
    chunked = pa.chunked_array(result_array)
    outputs.set_values(tokens_array=chunked)
─────────────────────────────────────────────────────
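Usage sketch ──────────────────────────────────────────
The per-token filter can be illustrated without kiara or polars: every
token passes through a single function that applies all configured
checks, so each token is only touched once. This is a sketch; the flag
names and values below are illustrative, not the module's exact config.

from typing import Optional

to_lowercase = True
remove_non_alpha = True
remove_all_numeric = True
min_length = 3
stopwords = {"the", "and"}

def check_token(token: str) -> Optional[str]:
    if len(token) < min_length:  # drop short tokens first, it is the cheapest check
        return None
    if to_lowercase:
        token = token.lower()
    if remove_non_alpha and not token.isalpha():
        return None
    if remove_all_numeric and token.isdigit():  # redundant if remove_non_alpha is set
        return None
    if token in stopwords:
        return None
    return token

tokens = ["The", "Quick", "123", "fox-like", "and", "ran"]
cleaned = [t for t in (check_token(t) for t in tokens) if t is not None]
print(cleaned)  # ['quick', 'ran']
─────────────────────────────────────────────────────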