
module_types

generate.LDA.for.tokens_array

type_name generate.LDA.for.tokens_array The registered name for this item type.
documentation

Perform Latent Dirichlet Allocation on a tokenized corpus.

This module computes models for a range of topic counts provided by the user.
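
For orientation, here is a minimal, self-contained sketch of the same idea with gensim (an assumption, not the module itself: the module's own create_model helper is not shown in its source, so a plain LdaModel is used as a stand-in):

    from gensim import corpora
    from gensim.models.ldamodel import LdaModel

    # a tiny, already-tokenized corpus
    tokens = [
        ["topic", "modelling", "finds", "latent", "themes"],
        ["lda", "assigns", "words", "to", "topics"],
        ["coherence", "scores", "help", "choose", "a", "topic", "count"],
    ]

    id2word = corpora.Dictionary(tokens)
    corpus = [id2word.doc2bow(doc) for doc in tokens]

    # train one model per topic count, e.g. num_topics_min=2, num_topics_max=3
    models = {}
    for num_topics in range(2, 4):
        models[num_topics] = LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)
        print(models[num_topics].print_topics(num_words=5))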

Documentation for the item.
authors Information about authorship for the item.
context
tags language_processing,LDA,tokens
labels
  • package: kiara_plugin.language_processing
references
Generic properties of this item (description, tags, labels, references, ...).
python_class
python_class_name LDAModule The name of the Python class.
python_module_name kiara_plugin.language_processing.modules.lda The name of the Python module this class lives in.
full_name kiara_plugin.language_processing.modules.lda.LDAModule The full class namespace.
The python class that implements this module type.
process_src The source code of the process method of the module:

    def process(self, inputs: ValueMap, outputs: ValueMap) -> None:
        from gensim import corpora

        logging.getLogger("gensim").setLevel(logging.ERROR)

        tokens_array: KiaraArray = inputs.get_value_data("tokens_array")
        tokens = tokens_array.arrow_array.to_pylist()

        words_per_topic = inputs.get_value_data("words_per_topic")
        num_topics_min = inputs.get_value_data("num_topics_min")
        num_topics_max = inputs.get_value_data("num_topics_max")
        if num_topics_max is None:
            num_topics_max = num_topics_min

        compute_coherence = inputs.get_value_data("compute_coherence")

        id2word = corpora.Dictionary(tokens)
        corpus = [id2word.doc2bow(text) for text in tokens]

        # model = gensim.models.ldamulticore.LdaMulticore(
        #     corpus, id2word=id2word, num_topics=num_topics, eval_every=None
        # )

        models = {}
        model_tables = {}
        coherence = {}

        # multi_threaded = False
        # if not multi_threaded:
        for nt in range(num_topics_min, num_topics_max + 1):
            model = self.create_model(corpus=corpus, num_topics=nt, id2word=id2word)
            models[nt] = model
            topic_print_model = model.print_topics(num_words=words_per_topic)
            # dbg(topic_print_model)
            # df = pd.DataFrame(topic_print_model, columns=["topic_id", "words"])
            # TODO: create table directly
            # result_table = Table.from_pandas(df)
            model_tables[nt] = topic_print_model

            if compute_coherence:
                coherence_result = self.compute_coherence(
                    model=model, corpus_model=tokens, id2word=id2word
                )
                coherence[nt] = coherence_result

        # else:
        #     def create_model(num_topics):
        #         model = self.create_model(corpus=corpus, num_topics=num_topics, id2word=id2word)
        #         topic_print_model = model.print_topics(num_words=30)
        #         df = pd.DataFrame(topic_print_model, columns=["topic_id", "words"])
        #         # TODO: create table directly
        #         result_table = Table.from_pandas(df)
        #         coherence_result = None
        #         if compute_coherence:
        #             coherence_result = self.compute_coherence(model=model, corpus_model=tokens, id2word=id2word)
        #         return (num_topics, model, result_table, coherence_result)
        #
        #     executor = ThreadPoolExecutor()
        #     results: typing.Any = executor.map(create_model, range(num_topics_min, num_topics_max + 1))
        #     executor.shutdown(wait=True)
        #     for r in results:
        #         models[r[0]] = r[1]
        #         model_tables[r[0]] = r[2]
        #         if compute_coherence:
        #             coherence[r[0]] = r[3]

        # df_coherence = pd.DataFrame(coherence.keys(), columns=["Number of topics"])
        # df_coherence["Coherence"] = coherence.values()

        if compute_coherence:
            coherence_table = self.assemble_coherence(
                models_dict=models, words_per_topic=words_per_topic
            )
        else:
            coherence_table = None

        coherence_map = {k: v.item() for k, v in coherence.items()}

        outputs.set_values(
            topic_models=model_tables,
            coherence_table=coherence_table,
            coherence_map=coherence_map,
        )
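
The compute_coherence and assemble_coherence helpers are not shown above; a plausible sketch of the coherence step using gensim's CoherenceModel (an assumption about its shape, not the module's actual implementation) could look like this:

    from gensim.models.coherencemodel import CoherenceModel

    def compute_coherence_sketch(model, texts, id2word):
        # "c_v" coherence needs the original tokenized texts plus the dictionary
        cm = CoherenceModel(model=model, texts=texts, dictionary=id2word, coherence="c_v")
        return cm.get_coherence()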

tokenize.string

type_name tokenize.string The registered name for this item type.
documentation

Tokenize a string.
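
A minimal sketch of the underlying NLTK call (the module additionally applies the optional length, alpha and lowercase filters configured on it; "punkt" is NLTK's tokenizer resource and may already be present):

    import nltk

    nltk.download("punkt", quiet=True)  # tokenizer models, fetched once
    tokens = nltk.word_tokenize("Kiara tokenizes text into a list of tokens.")
    print(tokens)  # ['Kiara', 'tokenizes', 'text', 'into', 'a', 'list', 'of', 'tokens', '.']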

Documentation for the item.
authors Information about authorship for the item.
context
tags language_processing
labels
  • package: kiara_plugin.language_processing
references
Generic properties of this item (description, tags, labels, references, ...).
python_class
python_class_name TokenizeTextModule The name of the Python class.
python_module_name kiara_plugin.language_processing.modules.tokens The name of the Python module this class lives in.
full_name kiara_plugin.language_processing.modules.tokens.TokenizeTextModule The full class namespace.
The python class that implements this module type.
process_src The source code of the process method of the module:

    def process(self, inputs: ValueMap, outputs: ValueMap) -> None:
        import nltk

        # TODO: module-independent caching?
        # language = inputs.get_value_data("language")
        text = inputs.get_value_data("text")

        tokenized = nltk.word_tokenize(text)

        result = tokenized
        if self.get_config_value("min_token_length") > 0:
            result = (
                x
                for x in tokenized
                if len(x) >= self.get_config_value("min_token_length")
            )

        if self.get_config_value("filter_non_alpha"):
            result = (x for x in result if x.isalpha())

        if self.get_config_value("to_lowercase"):
            result = (x.lower() for x in result)

        outputs.set_value("token_list", list(result))

tokenize.texts_array

type_name tokenize.texts_array The registered name for this item type.
documentation

Split sentences into words or words into characters.

In other words, this operation establishes the word boundaries (i.e., tokens), which is a very helpful way of finding patterns. It is also the typical step prior to stemming and lemmatization.
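
A minimal sketch of the same idea outside kiara, assuming NLTK and pyarrow are available: tokenize each document of an Arrow string array and keep the result as an Arrow array of token lists.

    import nltk
    import pyarrow as pa

    texts = pa.array(["First document here.", "A second, shorter one."])
    token_lists = [nltk.word_tokenize(t.as_py()) for t in texts]
    tokens_array = pa.chunked_array([pa.array(token_lists)])
    print(tokens_array)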

Documentation for the item.
authors Information about authorship for the item.
context
tags language_processing,tokenize,tokens
labels
  • package: kiara_plugin.language_processing
references
Generic properties of this item (description, tags, labels, references, ...).
python_class
python_class_name TokenizeTextArrayeModule The name of the Python class.
python_module_name kiara_plugin.language_processing.modules.tokens The name of the Python module this class lives in.
full_name kiara_plugin.language_processing.modules.tokens.TokenizeTextArrayeModule The full class namespace.
The python class that implements this module type.
process_src The source code of the process method of the module:

    def process(self, inputs: ValueMap, outputs: ValueMap):
        import nltk
        import polars as pl
        import pyarrow as pa

        array: KiaraArray = inputs.get_value_data("texts_array")
        # tokenize_by_word: bool = inputs.get_value_data("tokenize_by_word")

        column: pa.ChunkedArray = array.arrow_array

        # warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

        def word_tokenize(word):
            result = nltk.word_tokenize(word)
            return result

        series = pl.Series(name="tokens", values=column)
        result = series.apply(word_tokenize)
        result_array = result.to_arrow()

        # TODO: remove this cast once the array data type can handle non-chunked arrays
        chunked = pa.chunked_array(result_array)
        outputs.set_values(tokens_array=chunked)
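
Note that newer polars releases renamed Series.apply; outside the plugin, roughly the same step would look like the following sketch (map_elements and the explicit return dtype are assumptions about the polars version in use):

    import nltk
    import polars as pl

    series = pl.Series(name="texts", values=["one short text", "another text"])
    tokens = series.map_elements(nltk.word_tokenize, return_dtype=pl.List(pl.Utf8))
    tokens_arrow = tokens.to_arrow()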

create.stopwords_list

type_name create.stopwords_list The registered name for this item type.
documentation

Create a list of stopwords from one or multiple sources.

This will download nltk stopwords if necessary, and merge all input lists into a single, sorted list without duplicates.
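
A minimal sketch of the merge-and-sort behaviour described above, using NLTK's stopword corpus plus a custom list (the language names and extra words are illustrative):

    import nltk
    from nltk.corpus import stopwords

    nltk.download("stopwords", quiet=True)   # fetch the corpus if missing
    merged = set(stopwords.words("english"))
    merged.update(stopwords.words("german"))
    merged.update(["foo", "bar"])             # user-supplied extra stopwords
    stopwords_list = sorted(merged)           # single, sorted, de-duplicated list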

Documentation for the item.
authors Information about authorship for the item.
context
tags language_processing
labels
  • package: kiara_plugin.language_processing
references
Generic properties of this item (description, tags, labels, references, ...).
python_class
python_class_name AssembleStopwordsModule The name of the Python class.
python_module_name kiara_plugin.language_processing.modules.tokens The name of the Python module this class lives in.
full_name kiara_plugin.language_processing.modules.tokens.AssembleStopwordsModule The full class namespace.
The python class that implements this module type.
process_src The source code of the process method of the module:

    def process(self, inputs: ValueMap, outputs: ValueMap):
        stopwords = set()

        _languages = inputs.get_value_obj("languages")
        if _languages.is_set:
            all_stopwords = get_stopwords()
            languages: ListModel = _languages.data

            for language in languages.list_data:
                if language not in all_stopwords.fileids():
                    raise KiaraProcessingException(
                        f"Invalid language: {language}. Available: {', '.join(all_stopwords.fileids())}."
                    )
                stopwords.update(get_stopwords().words(language))

        _stopword_lists = inputs.get_value_obj("stopword_lists")
        if _stopword_lists.is_set:
            stopword_lists: ListModel = _stopword_lists.data
            for stopword_list in stopword_lists.list_data:
                if isinstance(stopword_list, str):
                    stopwords.add(stopword_list)
                else:
                    stopwords.update(stopword_list)

        outputs.set_value("stopwords_list", sorted(stopwords))

remove_stopwords.from.tokens_array

type_name remove_stopwords.from.tokens_array The registered name for this item type.
documentation

Remove stopwords from an array of token-lists.
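
A minimal sketch of the core filtering step, assuming pyarrow: drop stopwords (case-insensitively) from each token list in an Arrow array of token lists.

    import pyarrow as pa

    stopwords = {"the", "a", "of"}
    tokens_array = pa.array([["The", "structure", "of", "a", "corpus"], ["a", "short", "list"]])

    cleaned = [
        [tok for tok in token_list.as_py() if tok.lower() not in stopwords]
        for token_list in tokens_array
    ]
    result = pa.chunked_array([pa.array(cleaned)])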

Documentation for the item.
authors Information about authorship for the item.
context
tags language_processing
labels
  • package: kiara_plugin.language_processing
references
Generic properties of this item (description, tags, labels, references, ...).
python_class
python_class_name RemoveStopwordsModule The name of the Python class.
python_module_name kiara_plugin.language_processing.modules.tokens The name of the Python module this class lives in.
full_name kiara_plugin.language_processing.modules.tokens.RemoveStopwordsModule The full class namespace.
The python class that implements this module type.
process_src The source code of the process method of the module:

    def process(self, inputs: ValueMap, outputs: ValueMap) -> None:
        import pyarrow as pa

        custom_stopwords = inputs.get_value_data("additional_stopwords")

        if inputs.get_value_obj("languages").is_set:
            _languages: ListModel = inputs.get_value_data("languages")
            languages = _languages.list_data
        else:
            languages = []

        stopwords = set()
        if languages:
            for language in languages:
                if language not in get_stopwords().fileids():
                    raise KiaraProcessingException(
                        f"Invalid language: {language}. Available: {', '.join(get_stopwords().fileids())}."
                    )
                stopwords.update(get_stopwords().words(language))

        if custom_stopwords:
            stopwords.update(custom_stopwords)

        orig_array = inputs.get_value_obj("tokens_array")  # type: ignore
        if not stopwords:
            outputs.set_value("tokens_array", orig_array)
            return

        # if hasattr(orig_array, "to_pylist"):
        #     token_lists = orig_array.to_pylist()

        tokens_array = orig_array.data.arrow_array

        # TODO: use vaex for this
        result = []
        for token_list in tokens_array:
            cleaned_list = [x for x in token_list.as_py() if x.lower() not in stopwords]
            result.append(cleaned_list)

        outputs.set_value("tokens_array", pa.chunked_array(pa.array(result)))

preprocess.tokens_array

type_name preprocess.tokens_array The registered name for this item type.
documentation

Preprocess lists of tokens, including lowercasing, removing special characters, etc.

Lowercasing: Lowercase the words. This operation is a double-edged sword. It can be effective at yielding potentially better results in the case of relatively small datasets or datasets with a high percentage of OCR mistakes. For instance, if lowercasing is not performed, the algorithm will treat USA, Usa, usa, UsA, uSA, etc. as distinct tokens, even though they may all refer to the same entity. On the other hand, if the dataset does not contain such OCR mistakes, then it may become difficult to distinguish between homonyms and make interpreting the topics much harder.

Removing stopwords and words with fewer than three characters: Remove low-information words. These are typically words such as articles, pronouns, prepositions, conjunctions, etc. which are not semantically salient. There are numerous stopword lists available for many, though not all, languages which can be easily adapted to the individual researcher's needs. Removing words with fewer than three characters may additionally remove many OCR mistakes. Both these operations have the dual advantage of yielding more reliable results while reducing the size of the dataset, which in turn reduces the required processing power. This step can therefore hardly be considered optional in topic modelling (TM).

Noise removal: Remove elements such as punctuation marks, special characters, numbers, HTML formatting, etc. This operation is again concerned with removing elements that are not relevant to the text analysis and may in fact interfere with it. Depending on the dataset and research question, this operation can become essential.
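
A minimal sketch of a per-token check combining the operations described above (lowercasing, dropping short tokens, non-alphabetic tokens, and stopwords); the stopword set and length threshold are illustrative:

    from typing import Optional

    STOPWORDS = {"the", "and", "of"}

    def check_token(token: str, min_length: int = 3) -> Optional[str]:
        if len(token) < min_length:   # drop very short tokens (often OCR noise)
            return None
        token = token.lower()         # lowercasing (see the caveats above)
        if not token.isalpha():       # noise removal: numbers, punctuation, markup
            return None
        if token in STOPWORDS:        # stopword removal
            return None
        return token

    cleaned = [t for t in (check_token(x) for x in ["The", "USA", "1848", "archives"]) if t]
    print(cleaned)  # ['usa', 'archives']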

Documentation for the item.
authors Information about authorship for the item.
context
tags language_processing,tokens,preprocess
labels
  • package: kiara_plugin.language_processing
references
Generic properties of this item (description, tags, labels, references, ...).
python_class
python_class_name PreprocessModule The name of the Python class.
python_module_name kiara_plugin.language_processing.modules.tokens The name of the Python module this class lives in.
full_name kiara_plugin.language_processing.modules.tokens.PreprocessModule The full class namespace.
The python class that implements this module type.
process_src The source code of the process method of the module:

    def process(self, inputs: ValueMap, outputs: ValueMap):
        import polars as pl
        import pyarrow as pa

        tokens_array: KiaraArray = inputs.get_value_data("tokens_array")
        lowercase: bool = inputs.get_value_data("to_lowercase")
        remove_alphanumeric: bool = inputs.get_value_data("remove_alphanumeric")
        remove_non_alpha: bool = inputs.get_value_data("remove_non_alpha")
        remove_all_numeric: bool = inputs.get_value_data("remove_all_numeric")
        remove_short_tokens: int = inputs.get_value_data("remove_short_tokens")
        if remove_short_tokens is None:
            remove_short_tokens = -1

        _remove_stopwords = inputs.get_value_obj("remove_stopwords")
        if _remove_stopwords.is_set:
            stopword_list: Optional[Iterable[str]] = _remove_stopwords.data.list_data
        else:
            stopword_list = None

        # it's better to have one method every token goes through, rather than running every
        # test separately over the token list, because that way each token only needs to be
        # touched once (which is more efficient)
        def check_token(token: str) -> Optional[str]:

            # remove short tokens first, since that saves us all the other (more expensive) checks
            if remove_short_tokens > 0:
                if len(token) <= remove_short_tokens:
                    return None

            _token: str = token
            if lowercase:
                _token = _token.lower()

            if remove_non_alpha:
                match = _token if _token.isalpha() else None
                if match is None:
                    return None

            # if remove_non_alpha was set, we don't need to worry about tokens that include
            # numbers, since they are already filtered out
            if remove_alphanumeric and not remove_non_alpha:
                match = _token if _token.isalnum() else None
                if match is None:
                    return None

            # all-number tokens are already filtered out if the remove_non_alpha check above ran
            if remove_all_numeric and not remove_non_alpha:
                match = None if _token.isdigit() else _token
                if match is None:
                    return None

            if stopword_list and _token and _token.lower() in stopword_list:
                return None

            return _token

        series = pl.Series(name="tokens", values=tokens_array.arrow_array)
        result = series.apply(
            lambda token_list: [
                x
                for x in (check_token(token) for token in token_list)
                if x is not None
            ]
        )
        result_array = result.to_arrow()

        # TODO: remove this cast once the array data type can handle non-chunked arrays
        chunked = pa.chunked_array(result_array)
        outputs.set_values(tokens_array=chunked)