%env CONSOLE_WIDTH=140
from kiara.interfaces.python_api.workflow import Workflow
from kiara.utils.jupyter import graph_to_image
from kiara.utils.cli import terminal_print_model
env: CONSOLE_WIDTH=140
doc = """Example topic-modeling end-to-end workflow."""
workflow = Workflow.create("topic_modeling", doc=doc, replace_existing_alias=True)
# Creating step: import_text_corpus
workflow.add_step(operation="import.file_bundle", step_id="import_text_corpus")
╭─ Step: import_text_corpus ──────────────────────────────────────────────────────────────────────╮

  step_id       import_text_corpus
  module type   import.file_bundle
  module doc    Import a folder (file_bundle) from the local filesystem.

  inputs
    field name                type     description                                Required   Default
    ─────────────────────────────────────────────────────────────────────────────────────────────────
    import_text_corpus.path   string   The local path of the folder to import.   yes        -- no default --

  outputs
    field name                       type          description
    ─────────────────────────────────────────────────────────────────────────
    import_text_corpus.file_bundle   file_bundle   The imported file bundle.

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
# Creating step: create_stopwords_list
workflow.add_step(operation="create.stopwords_list", step_id="create_stopwords_list")
╭─ Step: create_stopwords_list ───────────────────────────────────────────────────────────────────╮

  step_id       create_stopwords_list
  module type   create.stopwords_list
  module doc    Create a list of stopwords from one or multiple sources.

                This will download nltk stopwords if necessary, and merge all input lists into a
                single, sorted list without duplicates.

  inputs
    field name                              type   description                            Required   Default
    ──────────────────────────────────────────────────────────────────────────────────────────────────────────
    create_stopwords_list.languages         list   A list of languages, will be used to   no         -- no default --
                                                   retrieve language-specific stopwords
                                                   from nltk.
    create_stopwords_list.stopword_lists    list   A list of lists of stopwords.          no         -- no default --

  outputs
    field name                              type   description
    ─────────────────────────────────────────────────────────────────────────
    create_stopwords_list.stopwords_list    list   A sorted list of unique stopwords.

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
# Creating step: create_text_corpus
step_create_text_corpus_config = {'constants': {}, 'defaults': {}, 'source_type': 'text_file_bundle', 'target_type': 'table', 'ignore_errors': False}
workflow.add_step(
operation="create.table",
module_config=step_create_text_corpus_config,
step_id="create_text_corpus")
╭─ Step: create_text_corpus ──────────────────────────────────────────────────────────────────────╮

  step_id         create_text_corpus
  module type     create.table
  module_config   {
                    "source_type": "text_file_bundle",
                    "target_type": "table",
                    "ignore_errors": false
                  }
  module doc      -- n/a --

  inputs
    field name                            type               description                     Required   Default
    ─────────────────────────────────────────────────────────────────────────────────────────────────────────────
    create_text_corpus.text_file_bundle   text_file_bundle   The type of the source value.   yes        -- no default --

  outputs
    field name                 type    description
    ──────────────────────────────────────────────────
    create_text_corpus.table   table   The result value.

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
# Connecting input(s) of step 'create_text_corpus'
workflow.connect_fields("create_text_corpus.text_file_bundle", "import_text_corpus.file_bundle")
# Creating step: extract_texts_column
workflow.add_step(operation="table.cut_column", step_id="extract_texts_column")
╭─ Step: extract_texts_column ────────────────────────────────────────────────────────────────────╮

  step_id       extract_texts_column
  module type   table.cut_column
  module doc    Cut off one column from a table, returning an array.

  inputs
    field name                         type     description                          Required   Default
    ──────────────────────────────────────────────────────────────────────────────────────────────────────
    extract_texts_column.table         table    A table.                             yes        -- no default --
    extract_texts_column.column_name   string   The name of the column to extract.   yes        -- no default --

  outputs
    field name                   type    description
    ─────────────────────────────────────────────────
    extract_texts_column.array   array   The column.

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
# Connecting input(s) of step 'extract_texts_column'
workflow.connect_fields("extract_texts_column.table", "create_text_corpus.table")
# Creating step: extract_filename_column
workflow.add_step(operation="table.cut_column", step_id="extract_filename_column")
╭─ Step: extract_filename_column ─────────────────────────────────────────────────────────────────╮

  step_id       extract_filename_column
  module type   table.cut_column
  module doc    Cut off one column from a table, returning an array.

  inputs
    field name                            type     description                          Required   Default
    ─────────────────────────────────────────────────────────────────────────────────────────────────────────
    extract_filename_column.table         table    A table.                             yes        -- no default --
    extract_filename_column.column_name   string   The name of the column to extract.   yes        -- no default --

  outputs
    field name                      type    description
    ────────────────────────────────────────────────────
    extract_filename_column.array   array   The column.

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
# Connecting input(s) of step 'extract_filename_column'
workflow.connect_fields("extract_filename_column.table", "create_text_corpus.table")
# Creating step: create_date_array
workflow.add_step(operation="parse.date_array", step_id="create_date_array")
╭─ Step: create_date_array ───────────────────────────────────────────────────────────────────────╮

  step_id         create_date_array
  module type     parse.date_array
  module_config   {
                    "add_inputs": true,
                    "input_fields": [],
                    "force_non_null": true,
                    "min_index": null,
                    "max_index": null,
                    "remove_tokens": []
                  }
  module doc      Create an array of date objects from an array of strings.

                  This module is very simplistic at the moment; more functionality and options
                  will be added in the future.

                  At its core, this module uses the standard parser from the dateutil package to
                  parse strings into dates. As this parser can't handle complex strings, the
                  input strings can be pre-processed in the following ways:

                  • 'cut' non-relevant parts of the string (using the 'min_index' & 'max_index'
                    input/config options)
                  • remove matching tokens from the string, and replace them with a single
                    whitespace (using the 'remove_tokens' option)

                  By default, if an input string can't be parsed this module will raise an
                  exception. This can be prevented by setting this module's 'force_non_null'
                  config option or input to 'False', in which case un-parsable strings will
                  appear as 'NULL' values in the resulting array.

  inputs
    field name                          type      description                                Required   Default
    ──────────────────────────────────────────────────────────────────────────────────────────────────────────────
    create_date_array.array             array     The input array.                           yes        -- no default --
    create_date_array.force_non_null    boolean   If set to 'True', raise an error if any    no         True
                                                  of the strings in the array can't be
                                                  parsed.
    create_date_array.min_index         integer   The minimum index from where to start      no         -- no default --
                                                  parsing the string(s).
    create_date_array.max_index         integer   The maximum index until which to parse     no         -- no default --
                                                  the string(s).
    create_date_array.remove_tokens     list      A list of tokens/characters to replace     no         []
                                                  with a single white-space before parsing
                                                  the input.

  outputs
    field name                     type    description
    ─────────────────────────────────────────────────────────────────────────────────
    create_date_array.date_array   array   The resulting array with items of a date data type.

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
# Connecting input(s) of step 'create_date_array'
workflow.connect_fields("create_date_array.array", "extract_filename_column.array")
# Creating step: tokenize_content
workflow.add_step(operation="tokenize.texts_array", step_id="tokenize_content")
╭─ Step: tokenize_content ────────────────────────────────────────────────────────────────────────╮

  step_id       tokenize_content
  module type   tokenize.texts_array
  module doc    Split sentences into words, or words into characters.

                In other words, this operation establishes the word boundaries (i.e. tokens),
                a very helpful way of finding patterns; it is also the typical step prior to
                stemming and lemmatization.

  inputs
    field name                          type      description                                Required   Default
    ──────────────────────────────────────────────────────────────────────────────────────────────────────────────
    tokenize_content.texts_array        array     An array of text items to be tokenized.    yes        -- no default --
    tokenize_content.tokenize_by_word   boolean   Whether to tokenize by word (default),     no         True
                                                  or character.

  outputs
    field name                      type    description
    ─────────────────────────────────────────────────────────────────────────────────
    tokenize_content.tokens_array   array   The tokenized content, as an array of lists of strings.

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
# Connecting input(s) of step 'tokenize_content'
workflow.connect_fields("tokenize_content.texts_array", "extract_texts_column.array")
# Creating step: preprocess_corpus
workflow.add_step(operation="preprocess.tokens_array", step_id="preprocess_corpus")
╭─ Step: preprocess_corpus ───────────────────────────────────────────────────────────────────────╮

  step_id       preprocess_corpus
  module type   preprocess.tokens_array
  module doc    Preprocess lists of tokens, incl. lowercasing, removing special characters, etc.

                Lowercasing: Lowercase the words. This operation is a double-edged sword. It can
                be effective at yielding potentially better results in the case of relatively
                small datasets or datasets with a high percentage of OCR mistakes. For instance,
                if lowercasing is not performed, the algorithm will treat USA, Usa, usa, UsA,
                uSA, etc. as distinct tokens, even though they may all refer to the same entity.
                On the other hand, if the dataset does not contain such OCR mistakes, then it may
                become difficult to distinguish between homonyms and make interpreting the topics
                much harder.

                Removing stopwords and words with less than three characters: Remove low
                information words. These are typically words such as articles, pronouns,
                prepositions, conjunctions, etc. which are not semantically salient. There are
                numerous stopword lists available for many, though not all, languages which can
                be easily adapted to the individual researcher's needs. Removing words with less
                than three characters may additionally remove many OCR mistakes. Both these
                operations have the dual advantage of yielding more reliable results while
                reducing the size of the dataset, thus in turn reducing the required processing
                power. This step can therefore hardly be considered optional in TM.

                Noise removal: Remove elements such as punctuation marks, special characters,
                numbers, html formatting, etc. This operation is again concerned with removing
                elements that may not be relevant to the text analysis and in fact interfere
                with it. Depending on the dataset and research question, this operation can
                become essential.

  inputs
    field name                              type      description                              Required   Default
    ────────────────────────────────────────────────────────────────────────────────────────────────────────────────
    preprocess_corpus.tokens_array          array     The tokens array to pre-process.         yes        -- no default --
    preprocess_corpus.to_lowercase          boolean   Apply lowercasing to the text.           no         False
    preprocess_corpus.remove_alphanumeric   boolean   Remove all tokens that include numbers   no         False
                                                      (e.g. ex1ample).
    preprocess_corpus.remove_non_alpha      boolean   Remove all tokens that include           no         False
                                                      punctuation and numbers (e.g.
                                                      ex1a.mple).
    preprocess_corpus.remove_all_numeric    boolean   Remove all tokens that contain numbers   no         False
                                                      only (e.g. 876).
    preprocess_corpus.remove_short_tokens   integer   Remove tokens shorter than a certain     no         False
                                                      length. If value is <= 0, no filtering
                                                      will be done.
    preprocess_corpus.remove_stopwords      list      Remove stopwords.                        no         -- no default --

  outputs
    field name                       type    description
    ────────────────────────────────────────────────────────────────────────────────────
    preprocess_corpus.tokens_array   array   The pre-processed content, as an array of lists of strings.

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
# Connecting input(s) of step 'preprocess_corpus'
workflow.connect_fields("preprocess_corpus.tokens_array", "tokenize_content.tokens_array")
workflow.connect_fields("preprocess_corpus.remove_stopwords", "create_stopwords_list.stopwords_list")
# Creating step: generate_lda
workflow.add_step(operation="generate.LDA.for.tokens_array", step_id="generate_lda")
╭─ Step: generate_lda ────────────────────────────────────────────────────────────────────────────╮

  step_id       generate_lda
  module type   generate.LDA.for.tokens_array
  module doc    Perform Latent Dirichlet Allocation on a tokenized corpus.

                This module computes models for a range of numbers of topics provided by the
                user.

  inputs
    field name                      type      description                                Required   Default
    ──────────────────────────────────────────────────────────────────────────────────────────────────────────
    generate_lda.tokens_array       array     The text corpus.                           yes        -- no default --
    generate_lda.num_topics_min     integer   The minimal number of topics.              no         7
    generate_lda.num_topics_max     integer   The maximum number of topics.              no         -- no default --
    generate_lda.compute_coherence  boolean   Whether to compute the coherence score     no         False
                                              for each model.
    generate_lda.words_per_topic    integer   How many words per topic to put in the     no         10
                                              result model.

  outputs
    field name                    type    description
    ───────────────────────────────────────────────────────────────────────────────────────
    generate_lda.topic_models     dict    A dictionary with one coherence model table for each number of topics.
    generate_lda.coherence_table  table   Coherence details.
    generate_lda.coherence_map    dict    A map with the coherence value for every number of topics.

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
# Connecting input(s) of step 'generate_lda'
workflow.connect_fields("generate_lda.tokens_array", "preprocess_corpus.tokens_array")
Setting workflow input/output names (optional)
To make our workflow nicer to use, we can set aliases for its inputs and outputs.
workflow.set_input_alias(input_field="extract_texts_column.column_name", alias="content_column_name")
workflow.set_input_alias(input_field="extract_filename_column.column_name", alias="filename_column_name")
workflow.set_input_alias(input_field="import_text_corpus.path", alias="text_corpus_folder_path")
workflow.set_input_alias(input_field="create_date_array.min_index", alias="date_parse_min")
workflow.set_input_alias(input_field="create_date_array.max_index", alias="date_parse_max")
workflow.set_input_alias(input_field="create_date_array.force_non_null", alias="date_force_non_null")
workflow.set_input_alias(input_field="create_date_array.remove_tokens", alias="date_remove_tokensl")
workflow.set_input_alias(input_field="tokenize_content.tokenize_by_word", alias="tokenize_by_word")
workflow.set_input_alias(input_field="generate_lda.num_topics_min", alias="num_topics_min")
workflow.set_input_alias(input_field="generate_lda.num_topics_max", alias="num_topics_max")
workflow.set_input_alias(input_field="generate_lda.compute_coherence", alias="compute_coherence")
workflow.set_input_alias(input_field="generate_lda.words_per_topic", alias="words_per_topic")
workflow.set_input_alias(input_field="create_stopwords_list.languages", alias="languages")
workflow.set_input_alias(input_field="create_stopwords_list.stopword_lists", alias="stopword_lists")
workflow.set_input_alias(input_field="preprocess_corpus.to_lowercase", alias="to_lowercase")
workflow.set_input_alias(input_field="preprocess_corpus.remove_alphanumeric", alias="remove_alphanumeric")
workflow.set_input_alias(input_field="preprocess_corpus.remove_non_alpha", alias="remove_non_alpha")
workflow.set_input_alias(input_field="preprocess_corpus.remove_all_numeric", alias="remove_all_numeric")
workflow.set_input_alias(input_field="preprocess_corpus.remove_short_tokens", alias="remove_short_tokens")
workflow.set_input_alias(input_field="preprocess_corpus.remove_stopwords", alias="remove_stopwords")
workflow.set_output_alias(output_field="import_text_corpus.file_bundle", alias="text_corpus_file_bundle")
workflow.set_output_alias(output_field="create_text_corpus.table", alias="text_corpus_table")
workflow.set_output_alias(output_field="extract_texts_column.array", alias="content_array")
workflow.set_output_alias(output_field="tokenize_content.tokens_array", alias="tokenized_corpus")
workflow.set_output_alias(output_field="preprocess_corpus.tokens_array", alias="preprocessed_corpus")
workflow.set_output_alias(output_field="generate_lda.topic_models", alias="topic_models")
workflow.set_output_alias(output_field="generate_lda.coherence_map", alias="coherence_map")
workflow.set_output_alias(output_field="generate_lda.coherence_table", alias="coherence_table")
workflow.set_output_alias(output_field="create_date_array.date_array", alias="date_array")
Workflow information
After our workflow is wired up, we can look at its structure and other properties.
Workflow status
A workflow consists of a series of 'states'; the most relevant one is usually the most recent. We can investigate the latest state's details like so:
workflow.current_state
╭─────────────────────────────────────────────────────────────────────────────────────────────────╮

  state id   zdpuAuzjRbxHaGKvGVJ7rxJQ27Hfy2k7qLTxC3vxowNLwnH4U

  pipeline inputs
    field name                               status    required   default   value id
    ────────────────────────────────────────────────────────────────────────────────────────────────
    create_stopwords_list__languages         valid     no                   00000000-0000-0000-0000-000000000001
    create_stopwords_list__stopword_lists    valid     no                   00000000-0000-0000-0000-000000000001
    import_text_corpus__path                 not set   yes                  00000000-0000-0000-0000-000000000001
    extract_filename_column__column_name     not set   yes                  00000000-0000-0000-0000-000000000001
    extract_texts_column__column_name        not set   yes                  00000000-0000-0000-0000-000000000001
    create_date_array__force_non_null        valid     no         True      00000000-0000-0000-0000-000000000001
    create_date_array__max_index             valid     no                   00000000-0000-0000-0000-000000000001
    create_date_array__min_index             valid     no                   00000000-0000-0000-0000-000000000001
    create_date_array__remove_tokens         valid     no         []        00000000-0000-0000-0000-000000000001
    tokenize_content__tokenize_by_word       valid     no         True      00000000-0000-0000-0000-000000000001
    preprocess_corpus__remove_all_numeric    valid     no         False     00000000-0000-0000-0000-000000000001
    preprocess_corpus__remove_alphanumeric   valid     no         False     00000000-0000-0000-0000-000000000001
    preprocess_corpus__remove_non_alpha      valid     no         False     00000000-0000-0000-0000-000000000001
    preprocess_corpus__remove_short_tokens   valid     no         False     00000000-0000-0000-0000-000000000001
    preprocess_corpus__to_lowercase          valid     no         False     00000000-0000-0000-0000-000000000001
    generate_lda__compute_coherence          valid     no         False     00000000-0000-0000-0000-000000000001
    generate_lda__num_topics_max             valid     no                   00000000-0000-0000-0000-000000000001
    generate_lda__num_topics_min             valid     no         7         00000000-0000-0000-0000-000000000001
    generate_lda__words_per_topic            valid     no         10        00000000-0000-0000-0000-000000000001

  steps
    ├── stage: 1
    │   ├── step: create_stopwords_list
    │   │   └── status: inputs ready
    │   └── step: import_text_corpus
    │       └── status: inputs invalid
    │           └── path: not set
    ├── stage: 2
    │   └── step: create_text_corpus
    │       └── status: inputs invalid
    │           └── text_file_bundle: not set
    ├── stage: 3
    │   ├── step: extract_filename_column
    │   │   └── status: inputs invalid
    │   │       ├── table: not set
    │   │       └── column_name: not set
    │   └── step: extract_texts_column
    │       └── status: inputs invalid
    │           ├── table: not set
    │           └── column_name: not set
    ├── stage: 4
    │   ├── step: create_date_array
    │   │   └── status: inputs invalid
    │   │       └── array: not set
    │   └── step: tokenize_content
    │       └── status: inputs invalid
    │           └── texts_array: not set
    ├── stage: 5
    │   └── step: preprocess_corpus
    │       └── status: inputs invalid
    │           └── tokens_array: not set
    └── stage: 6
        └── step: generate_lda
            └── status: inputs invalid
                └── tokens_array: not set

  pipeline outputs
    field name                               status    type   value id
    ──────────────────────────────────────────────────────────────────────────────────────
    create_stopwords_list__stopwords_list    not set   none   00000000-0000-0000-0000-000000000001
    import_text_corpus__file_bundle          not set   none   00000000-0000-0000-0000-000000000001
    create_text_corpus__table                not set   none   00000000-0000-0000-0000-000000000001
    extract_filename_column__array           not set   none   00000000-0000-0000-0000-000000000001
    extract_texts_column__array              not set   none   00000000-0000-0000-0000-000000000001
    create_date_array__date_array            not set   none   00000000-0000-0000-0000-000000000001
    tokenize_content__tokens_array           not set   none   00000000-0000-0000-0000-000000000001
    preprocess_corpus__tokens_array          not set   none   00000000-0000-0000-0000-000000000001
    generate_lda__coherence_map              not set   none   00000000-0000-0000-0000-000000000001
    generate_lda__coherence_table            not set   none   00000000-0000-0000-0000-000000000001
    generate_lda__topic_models               not set   none   00000000-0000-0000-0000-000000000001

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
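The terminal_print_model helper imported at the top renders kiara model objects explicitly, which is handy outside of Jupyter's automatic display. A minimal sketch, assuming the state object returned by current_state can be passed to it directly:
# Explicitly render the current state (assumption: terminal_print_model
# accepts the model object returned by workflow.current_state).
terminal_print_model(workflow.current_state)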
Pipeline execution graph
Let's look at the execution graph for the current workflow pipeline:
graph_to_image(workflow.pipeline.execution_graph)
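If image rendering is unavailable, the same information can be inspected programmatically. A small sketch, under the assumption that the execution graph is a networkx DiGraph (which the graph_to_image helper suggests):
# Walk the execution graph directly (assumption: it is a networkx DiGraph).
graph = workflow.pipeline.execution_graph
for node in graph.nodes:
    print(node)
for source, target in graph.edges:
    print(f"{source} -> {target}")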
Workflow inputs
Once a workflow has an assembled pipeline, we can set its inputs. We use the input field names that we got from the result of the workflow.current_state call.
workflow.set_input("text_corpus_folder_path", "/home/markus/projects/kiara/dev/kiara.examples/examples/pipelines/topic_modeling/../../data/text_corpus/data")
workflow.set_input("content_column_name", "content")
workflow.set_input("filename_column_name", "file_name")
workflow.set_input("date_force_non_null", None)
workflow.set_input("date_parse_min", 11)
workflow.set_input("date_parse_max", 21)
workflow.set_input("date_remove_tokensl", None)
workflow.set_input("tokenize_by_word", None)
workflow.set_input("languages", ['italian'])
workflow.set_input("stopword_lists", [])
workflow.set_input("to_lowercase", None)
workflow.set_input("remove_alphanumeric", None)
workflow.set_input("remove_non_alpha", None)
workflow.set_input("remove_all_numeric", None)
workflow.set_input("remove_short_tokens", None)
workflow.set_input("num_topics_min", 7)
workflow.set_input("num_topics_max", 9)
workflow.set_input("compute_coherence", True)
workflow.set_input("words_per_topic", None)
# process all workflow steps that can be processed
workflow.process_steps()
# print the current state, after we set our inputs
workflow.current_state
╭─────────────────────────────────────────────────────────────────────────────────────────────────╮

  state id   zdpuB2sHA2sfuLESmdATMSZm5Bi9TSJS697TGGVuJfvEqGrdT

  pipeline inputs
    field name                               status   required   default   value id
    ───────────────────────────────────────────────────────────────────────────────────────────────
    create_stopwords_list__languages         valid    no                   76394b45-2dcb-44ca-b22b-01e0ee25bd9f
    create_stopwords_list__stopword_lists    valid    no                   cfd4f58b-358d-4640-af08-b745ffaa7fd3
    import_text_corpus__path                 valid    yes                  8721e6c2-9215-4020-93aa-f2fa90eec13b
    extract_filename_column__column_name     valid    yes                  0b6bd257-60c6-4434-b66c-e855aa203622
    extract_texts_column__column_name        valid    yes                  fd3ef854-c982-4589-b5c4-112aae1c597e
    create_date_array__force_non_null        valid    no         True      00000000-0000-0000-0000-000000000002
    create_date_array__max_index             valid    no                   a3846cdb-68cb-461a-9635-d0e9bb73681e
    create_date_array__min_index             valid    no                   4e8b9a9d-80e0-4595-a244-400746a2215b
    create_date_array__remove_tokens         valid    no         []        00000000-0000-0000-0000-000000000002
    tokenize_content__tokenize_by_word       valid    no         True      00000000-0000-0000-0000-000000000002
    preprocess_corpus__remove_all_numeric    valid    no         False     00000000-0000-0000-0000-000000000002
    preprocess_corpus__remove_alphanumeric   valid    no         False     00000000-0000-0000-0000-000000000002
    preprocess_corpus__remove_non_alpha      valid    no         False     00000000-0000-0000-0000-000000000002
    preprocess_corpus__remove_short_tokens   valid    no         False     00000000-0000-0000-0000-000000000002
    preprocess_corpus__to_lowercase          valid    no         False     00000000-0000-0000-0000-000000000002
    generate_lda__compute_coherence          valid    no         False     0f2ecd3e-15a8-482b-b0e2-20b65d5d4f53
    generate_lda__num_topics_max             valid    no                   f9e07551-de8d-4e36-a9f6-cffc01d4c149
    generate_lda__num_topics_min             valid    no         7         220f512a-03c0-4693-bba7-73142b8522ae
    generate_lda__words_per_topic            valid    no         10        00000000-0000-0000-0000-000000000002

  steps
    ├── stage: 1
    │   ├── step: create_stopwords_list
    │   │   └── status: results ready
    │   └── step: import_text_corpus
    │       └── status: results ready
    ├── stage: 2
    │   └── step: create_text_corpus
    │       └── status: results ready
    ├── stage: 3
    │   ├── step: extract_filename_column
    │   │   └── status: results ready
    │   └── step: extract_texts_column
    │       └── status: results ready
    ├── stage: 4
    │   ├── step: create_date_array
    │   │   └── status: results ready
    │   └── step: tokenize_content
    │       └── status: results ready
    ├── stage: 5
    │   └── step: preprocess_corpus
    │       └── status: results ready
    └── stage: 6
        └── step: generate_lda
            └── status: results ready

  pipeline outputs
    field name                               status   type          value id
    ──────────────────────────────────────────────────────────────────────────────────────────
    create_stopwords_list__stopwords_list    valid    list          c4c81256-c387-471e-8a05-76890eaefbcd
    import_text_corpus__file_bundle          valid    file_bundle   9856156c-b8dd-46c4-820d-e7f4934f89c9
    create_text_corpus__table                valid    table         640866c0-dac7-48f3-82d4-8a85617ca072
    extract_filename_column__array           valid    array         b74e8402-ee7c-446f-8f3f-6fe9048d1e7e
    extract_texts_column__array              valid    array         4b002cd8-2e48-456c-8cae-5479c5b1d612
    create_date_array__date_array            valid    array         4abcfa2e-8ba0-4b04-8414-2a3e5ac38eea
    tokenize_content__tokens_array           valid    array         e184c7e5-83c6-4821-ad3a-54b79afff9aa
    preprocess_corpus__tokens_array          valid    array         685323a1-0db9-41b8-a518-c247a089e6ca
    generate_lda__coherence_map              valid    dict          c746a2dd-a587-42c1-9967-9753789bf616
    generate_lda__coherence_table            valid    table         5fd0a91d-7682-4b08-a0a9-f1eb361d60f0
    generate_lda__topic_models               valid    dict          2c1ad51f-badf-4276-b368-d2e144e361bc

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
Workflow outputs
To print the actual data of the workflow's current outputs, we access the current_output_values property of the workflow object:
workflow.current_output_values
╭─────────────────────────────────────────────────────────────────────────────────────────────────╮

  field                     value
  ─────────────────────────────────────────────────────────────────────────────────────────────────

  coherence_map
      dict data     {
                      "7": 0.23418388139804186,
                      "8": 0.2320449393569416,
                      "9": 0.232008834255219
                    }
      dict schema   {
                      "title": "dict",
                      "type": "object"
                    }

  coherence_table
      topic_id   words                                                                            num_topics
      ───────────────────────────────────────────────────────────────────────────────────────────────────────
      1          sempre, j, perchè, 1, degenerato, mente, mai, no, solo, re, altri, to, te, egl   7
      2          no, j, re, perchè, to, 1, sempre, te, egli, degenerato, mente, do, fatto, essi   7
      3          perchè, 1, sempre, no, j, to, degenerato, egli, te, re, mente, oggi, essi, far   7
      4          sempre, j, no, altri, perchè, te, 1, degenerato, solo, to, re, fatto, egli, ta   7
      5          sempre, perchè, j, no, degenerato, te, solo, 1, egli, re, altri, do, essi, ri,   7
      6          perchè, degenerato, egli, no, j, re, 1, sempre, fatto, te, ta, to, oggi, quel,   7
      7          j, sempre, degenerato, 1, egli, no, to, oggi, te, re, mente, perchè, quando, e   7
      1          sempre, perchè, altri, 1, j, te, no, Figli, solo, re, po, nè, degenerato, to,    8
      2          j, sempre, degenerato, te, re, to, 1, perchè, mai, ta, egli, altri, no, oggi,    8
      3          sempre, 1, j, perchè, no, degenerato, fare, re, altri, solo, mente, nè, Figli,   8
      4          no, perchè, sempre, j, degenerato, 1, egli, to, re, te, mente, essi, solo, fat   8
      5          sempre, perchè, degenerato, 1, egli, j, re, mai, altri, no, do, te, giornale,    8
      6          sempre, altri, solo, to, j, perchè, mente, 1, re, State, ta, mai, nè, te, Figl   8
      7          sempre, j, re, perchè, degenerato, 1, egli, no, te, to, mai, mente, solo, oggi   8
      8          sempre, perchè, te, 1, j, re, egli, degenerato, no, to, fare, ta, oggi, mai, r   8
      1          sempre, perchè, j, no, altri, degenerato, re, te, 1, mente, essi, mai, egli, t   9
      2          no, perchè, sempre, j, 1, degenerato, to, egli, oggi, fatto, re, te, solo, mai   9
      3          sempre, te, perchè, j, no, to, 1, egli, altri, re, mente, mai, degenerato, sol   9
      4          sempre, j, degenerato, no, 1, re, te, perchè, fare, fatto, to, mente, mai, egl   9
      5          sempre, no, perchè, 1, j, altri, mente, to, fatto, degenerato, nè, re, solo, e   9
      6          sempre, egli, j, 1, no, degenerato, perchè, te, essi, quel, fatto, re, ta, ogg   9
      7          sempre, to, perchè, j, 1, mente, altri, re, solo, essi, te, fare, po, degenera   9
      8          sempre, perchè, egli, no, j, 1, degenerato, re, te, due, solo, to, altri, ment   9
      9          perchè, j, sempre, no, degenerato, egli, 1, to, re, te, oggi, giornale, solo,    9

  content_array
      LA RAGIONE
      LA RAG ONE
      LA RAGIONE
      contro i vili, i camorristi, i sicari, i falsari e gli austriacanti, nemici dell ...
      contro i vili, i camorristi, i sicari, i falsari e gli austriacanti, nemici dell ...
      LA RAGIONA
      LA RAGIONE
      LA RAGIONE
      contro i vili, i camorristi, i sicari, i falsari e gli austriacanti, nemici dell ...
      LA RAG ONE
      contro 1 vili, i camorristi, i sicari, i falsari e gli austriacanti, nemici dell ...
      ■■■
      La Rassegna
      Both Phones
      ■ jSrìt** W?? iIK 38®f- i^M
      ■Both Phones

  date_array
      1917-04-25 00:00:00
      1917-04-25 00:00:00
      1917-04-25 00:00:00
      1917-04-25 00:00:00
      1917-05-05 00:00:00
      1917-05-05 00:00:00
      1917-05-05 00:00:00
      1917-05-05 00:00:00
      1917-05-16 00:00:00
      1917-05-16 00:00:00
      1917-05-16 00:00:00
      1917-04-07 00:00:00
      1917-04-14 00:00:00
      1917-04-14 00:00:00
      1917-04-21 00:00:00
      1917-04-21 00:00:00

  preprocessed_corpus
      ['RAGIONE', 'ORGANO', 'DIFESA', "ITALIANITÀ'", '1', 'vili', ',', 'camorristi', ' ...
      ['RAG', 'ONE', 'vili', ',', 'camorristi', ',', 'sicari', ',', 'falsari', 'austri ...
      ['RAGIONE', 'ORGANO', 'DIFESA', 'ITALIANITÀ', 'vili', ',', 'camorristi', ',', 's ...
      ['vili', ',', 'camorristi', ',', 'sicari', ',', 'falsari', 'austriacanti', ',',  ...
      ['vili', ',', 'camorristi', ',', 'sicari', ',', 'falsari', 'austriacanti', ',',  ...
      ['RAGIONA', 'ORGANO', 'DIFESA', 'ITALIANITÀ', 'vili', ',', 'camorristi', ',', 's ...
      ['RAGIONE', 'ORGANO', 'DIFESA', "ITALIANITÀ'", 'vili', ',', 'camorristi', ',', ' ...
      ['RAGIONE', 'vili', ',', '1', 'camorristi', ',', 'sicari', ',', 'falsari', 'aust ...
      ['vili', ',', 'camorristi', ',', 'sicari', ',', 'falsari', 'austriacanti', ',',  ...
      ['RAG', 'ONE', 'ORGANO', 'DIFESA', 'ITALIANITÀ', "''", 'vili', ',', 'camorristi' ...
      ['1', 'vili', ',', 'camorristi', ',', 'sicari', ',', 'falsari', 'austriacanti',  ...
      ['■■■', 'Rassegna', '_', 'Both', 'Phones', 'ANNO', 'No', '.', '1', 'perche', "'" ...
      ['Rassegna', 'Jjoth', 'Phones', 'ANNO', 'No', '.', '2', 'BASTA', '!', '...', 'qu ...
      ['Both', 'Phones', 'ANNO', '.', 'No', '.', '2', 'BASTA', '!', '...', 'uà', 'quai ...
      ['■', 'jSrìt', '*', '*', 'W', '?', '?', 'iIK', '38®f-', 'i^M', 'F', '<', '5É', ' ...
      ['■Both', 'Phones', 'ANNO', '11', '.', 'No', '.', '5', 'COSE', 'POSTO', 'va', 'd ...

  text_corpus_file_bundle
      bundle name       data
      number_of_files   16
      size              298452
      included files
          (relative) path                                         size
          ───────────────────────────────────────────────────────────────
          La_Ragione/sn84037024_1917-04-25_ed-1_seq-1_ocr.txt     16613
          La_Ragione/sn84037024_1917-04-25_ed-2_seq-1_ocr.txt     16679
          La_Ragione/sn84037024_1917-04-25_ed-3_seq-1_ocr.txt     16793
          La_Ragione/sn84037024_1917-04-25_ed-4_seq-1_ocr.txt     16235
          La_Ragione/sn84037024_1917-05-05_ed-1_seq-1_ocr.txt     18346
          La_Ragione/sn84037024_1917-05-05_ed-2_seq-1_ocr.txt     18474
          La_Ragione/sn84037024_1917-05-05_ed-3_seq-1_ocr.txt     18280
          La_Ragione/sn84037024_1917-05-05_ed-4_seq-1_ocr.txt     18481
          La_Ragione/sn84037024_1917-05-16_ed-1_seq-1_ocr.txt     18620
          La_Ragione/sn84037024_1917-05-16_ed-2_seq-1_ocr.txt     18698
          La_Ragione/sn84037024_1917-05-16_ed-3_seq-1_ocr.txt     18540
          La_Rassegna/sn84037025_1917-04-07_ed-1_seq-1_ocr.txt    19397
          La_Rassegna/sn84037025_1917-04-14_ed-1_seq-1_ocr.txt    20647
          La_Rassegna/sn84037025_1917-04-14_ed-2_seq-1_ocr.txt    20650
          La_Rassegna/sn84037025_1917-04-21_ed-1_seq-1_ocr.txt    21017
          La_Rassegna/sn84037025_1917-04-21_ed-2_seq-1_ocr.txt    20982

  text_corpus_table
      id   rel_path                  mime_type    size    content                    file_name
      ──────────────────────────────────────────────────────────────────────────────────────────────────────
      0    La_Ragione/sn84037024_1   text/plain   16613   LA RAGIONE                 sn84037024_1917-04-25_e
      1    La_Ragione/sn84037024_1   text/plain   16679   LA RAG ONE                 sn84037024_1917-04-25_e
      2    La_Ragione/sn84037024_1   text/plain   16793   LA RAGIONE                 sn84037024_1917-04-25_e
      3    La_Ragione/sn84037024_1   text/plain   16235   contro i vili, i camorri   sn84037024_1917-04-25_e
      4    La_Ragione/sn84037024_1   text/plain   18346   contro i vili, i camorri   sn84037024_1917-05-05_e
      5    La_Ragione/sn84037024_1   text/plain   18474   LA RAGIONA                 sn84037024_1917-05-05_e
      6    La_Ragione/sn84037024_1   text/plain   18280   LA RAGIONE                 sn84037024_1917-05-05_e
      7    La_Ragione/sn84037024_1   text/plain   18481   LA RAGIONE                 sn84037024_1917-05-05_e
      8    La_Ragione/sn84037024_1   text/plain   18620   contro i vili, i camorri   sn84037024_1917-05-16_e
      9    La_Ragione/sn84037024_1   text/plain   18698   LA RAG ONE                 sn84037024_1917-05-16_e
      10   La_Ragione/sn84037024_1   text/plain   18540   contro 1 vili, i camorri   sn84037024_1917-05-16_e
      11   La_Rassegna/sn84037025_   text/plain   19397   ■■■                        sn84037025_1917-04-07_e
      12   La_Rassegna/sn84037025_   text/plain   20647   La Rassegna                sn84037025_1917-04-14_e
      13   La_Rassegna/sn84037025_   text/plain   20650   Both Phones                sn84037025_1917-04-14_e
      14   La_Rassegna/sn84037025_   text/plain   21017   ■ jSrìt** W?? iIK 38®f-    sn84037025_1917-04-21_e
      15   La_Rassegna/sn84037025_   text/plain   20982   ■Both Phones               sn84037025_1917-04-21_e

  tokenized_corpus
      ['LA', 'RAGIONE', 'ORGANO', 'DI', 'DIFESA', 'DELLA', "ITALIANITÀ'", 'contro', '1 ...
      ['LA', 'RAG', 'ONE', 'contro', 'i', 'vili', ',', 'i', 'camorristi', ',', 'i', 's ...
      ['LA', 'RAGIONE', 'ORGANO', 'DI', 'DIFESA', 'DELLA', 'ITALIANITÀ', 'contro', 'i' ...
      ['contro', 'i', 'vili', ',', 'i', 'camorristi', ',', 'i', 'sicari', ',', 'i', 'f ...
      ['contro', 'i', 'vili', ',', 'i', 'camorristi', ',', 'i', 'sicari', ',', 'i', 'f ...
      ['LA', 'RAGIONA', 'ORGANO', 'DI', 'DIFESA', 'DELLA', 'ITALIANITÀ', 'contro', 'i' ...
      ['LA', 'RAGIONE', 'ORGANO', 'DI', 'DIFESA', 'DELLA', "ITALIANITÀ'", 'contro', 'i ...
      ['LA', 'RAGIONE', 'contro', 'i', 'vili', ',', '1', 'camorristi', ',', 'i', 'sica ...
      ['contro', 'i', 'vili', ',', 'i', 'camorristi', ',', 'i', 'sicari', ',', 'i', 'f ...
      ['LA', 'RAG', 'ONE', 'ORGANO', 'DI', 'DIFESA', 'DELLA', 'ITALIANITÀ', "''", 'con ...
      ['contro', '1', 'vili', ',', 'i', 'camorristi', ',', 'i', 'sicari', ',', 'i', 'f ...
      ['■■■', 'La', 'Rassegna', '_', 'I', 'Both', 'Phones', 'ANNO', 'L', 'No', '.', '1 ...
      ['La', 'Rassegna', 'Jjoth', 'Phones', 'ANNO', 'L', 'No', '.', '2', 'BASTA', '!', ...
      ['Both', 'Phones', 'ANNO', 'I', '.', 'No', '.', '2', 'BASTA', '!', '...', 'uà',  ...
      ['■', 'jSrìt', '*', '*', 'W', '?', '?', 'iIK', '38®f-', 'i^M', 'F', '<', '5É', ' ...
      ['■Both', 'Phones', 'ANNO', '11', '.', 'No', '.', '5', 'LE', 'COSE', 'A', 'POSTO ...

  topic_models
      dict data     {
                      "7": [
                        [0, "0.085*\",\" + 0.028*\".\" + 0.012*\";\" + 0.006*\"'\" + 0.005*\"!\" + 0.004*\"''…],
                        [1, "0.089*\",\" + 0.035*\".\" + 0.010*\";\" + 0.006*\"!\" + 0.006*\"'\" + 0.004*\":\…],
                        [2, "0.101*\",\" + 0.031*\".\" + 0.012*\";\" + 0.006*\"'\" + 0.006*\"!\" + 0.005*\"*\…],
                        [3, "0.067*\",\" + 0.032*\".\" + 0.013*\";\" + 0.007*\"'\" + 0.006*\"!\" + 0.004*\":\…],
                        [4, "0.075*\",\" + 0.037*\".\" + 0.013*\";\" + 0.008*\"!\" + 0.006*\"'\" + 0.004*\"se…],
                        [5, "0.073*\",\" + 0.030*\".\" + 0.014*\";\" + 0.007*\"!\" + 0.004*\"'\" + 0.003*\"pe…],
                        [6, "0.069*\",\" + 0.026*\".\" + 0.007*\";\" + 0.007*\"!\" + 0.006*\"'\" + 0.004*\"''…]
                      ],
                      "8": [
                        [0, "0.028*\",\" + 0.010*\".\" + 0.004*\";\" + 0.004*\"'\" + 0.003*\"!\" + 0.003*\":\…],
                        [1, "0.069*\",\" + 0.025*\".\" + 0.013*\";\" + 0.008*\"!\" + 0.005*\"'\" + 0.004*\":\…],
                        [2, "0.056*\",\" + 0.027*\".\" + 0.012*\";\" + 0.007*\"'\" + 0.005*\"!\" + 0.005*\":\…],
                        [3, "0.110*\",\" + 0.040*\".\" + 0.012*\";\" + 0.006*\"!\" + 0.006*\"'\" + 0.005*\"*\…],
                        [4, "0.029*\",\" + 0.017*\".\" + 0.006*\";\" + 0.004*\"!\" + 0.003*\"'\" + 0.002*\":\…],
                        [5, "0.015*\",\" + 0.010*\".\" + 0.004*\";\" + 0.003*\"'\" + 0.002*\"!\" + 0.002*\"''…],
                        [6, "0.057*\",\" + 0.033*\".\" + 0.013*\";\" + 0.008*\"!\" + 0.007*\"'\" + 0.005*\"se…],
                        [7, "0.077*\",\" + 0.022*\".\" + 0.012*\";\" + 0.007*\"!\" + 0.006*\"'\" + 0.004*\"''…]
                      ],
                      "9": [
                        [0, "0.064*\",\" + 0.023*\".\" + 0.010*\";\" + 0.006*\"!\" + 0.004*\"'\" + 0.004*\"*\…],
                        [1, "0.049*\",\" + 0.032*\".\" + 0.009*\";\" + 0.005*\"'\" + 0.004*\"!\" + 0.003*\"``…],
                        [2, "0.079*\",\" + 0.020*\".\" + 0.010*\";\" + 0.004*\":\" + 0.004*\"'\" + 0.004*\"se…],
                        [3, "0.109*\",\" + 0.030*\".\" + 0.015*\";\" + 0.007*\"!\" + 0.007*\"'\" + 0.004*\":\…],
                        [4, "0.051*\",\" + 0.030*\".\" + 0.010*\";\" + 0.005*\"''\" + 0.004*\"'\" + 0.004*\"!…],
                        [5, "0.047*\",\" + 0.016*\".\" + 0.008*\";\" + 0.005*\"!\" + 0.003*\"''\" + 0.003*\"'…],
                        [6, "0.074*\",\" + 0.030*\".\" + 0.010*\";\" + 0.006*\"!\" + 0.005*\"''\" + 0.005*\"'…],
                        [7, "0.079*\",\" + 0.039*\".\" + 0.012*\";\" + 0.007*\"!\" + 0.006*\"'\" + 0.004*\":\…],
                        [8, "0.094*\",\" + 0.041*\".\" + 0.012*\";\" + 0.007*\"'\" + 0.007*\"!\" + 0.004*\"*\…]
                      ]
                    }
      dict schema   {
                      "title": "dict",
                      "type": "object"
                    }

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
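Individual outputs can also be pulled out and used as plain Python data. A sketch, assuming the value map returned by current_output_values supports get_value_data (as kiara value maps generally do):
# Fetch the raw data behind a single output field (assumption: the value map
# exposes a get_value_data helper).
coherence = workflow.current_output_values.get_value_data("coherence_map")
print(coherence)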
Workflow snapshot
So far, our workflow only exists in memory. If we want to save it so we can have a look at it again at a later stage, we can snapshot the current state, which will save the current structure of the internal pipeline as well as all inputs that are currently set. In addition, this will register the workflow under the alias we specified at the top of this file when creating the Workflow object (in our case: topic_modeling).
If we did not specify save=True, the structure of the pipeline and its inputs would still be frozen and kept, but only in memory, and we'd only be able to access it in our current session.
workflow.snapshot(save=True)
╭─────────────────────────────────────────────────────────────────────────────────────────────────╮

  state id   zdpuB2sHA2sfuLESmdATMSZm5Bi9TSJS697TGGVuJfvEqGrdT

  pipeline inputs
    field name                               status   required   default   value id
    ───────────────────────────────────────────────────────────────────────────────────────────────
    create_stopwords_list__languages         valid    no                   76394b45-2dcb-44ca-b22b-01e0ee25bd9f
    create_stopwords_list__stopword_lists    valid    no                   cfd4f58b-358d-4640-af08-b745ffaa7fd3
    import_text_corpus__path                 valid    yes                  8721e6c2-9215-4020-93aa-f2fa90eec13b
    extract_filename_column__column_name     valid    yes                  0b6bd257-60c6-4434-b66c-e855aa203622
    extract_texts_column__column_name        valid    yes                  fd3ef854-c982-4589-b5c4-112aae1c597e
    create_date_array__force_non_null        valid    no         True      00000000-0000-0000-0000-000000000002
    create_date_array__max_index             valid    no                   a3846cdb-68cb-461a-9635-d0e9bb73681e
    create_date_array__min_index             valid    no                   4e8b9a9d-80e0-4595-a244-400746a2215b
    create_date_array__remove_tokens         valid    no         []        00000000-0000-0000-0000-000000000002
    tokenize_content__tokenize_by_word       valid    no         True      00000000-0000-0000-0000-000000000002
    preprocess_corpus__remove_all_numeric    valid    no         False     00000000-0000-0000-0000-000000000002
    preprocess_corpus__remove_alphanumeric   valid    no         False     00000000-0000-0000-0000-000000000002
    preprocess_corpus__remove_non_alpha      valid    no         False     00000000-0000-0000-0000-000000000002
    preprocess_corpus__remove_short_tokens   valid    no         False     00000000-0000-0000-0000-000000000002
    preprocess_corpus__to_lowercase          valid    no         False     00000000-0000-0000-0000-000000000002
    generate_lda__compute_coherence          valid    no         False     0f2ecd3e-15a8-482b-b0e2-20b65d5d4f53
    generate_lda__num_topics_max             valid    no                   f9e07551-de8d-4e36-a9f6-cffc01d4c149
    generate_lda__num_topics_min             valid    no         7         220f512a-03c0-4693-bba7-73142b8522ae
    generate_lda__words_per_topic            valid    no         10        00000000-0000-0000-0000-000000000002

  steps
    ├── stage: 1
    │   ├── step: create_stopwords_list
    │   │   └── status: results ready
    │   └── step: import_text_corpus
    │       └── status: results ready
    ├── stage: 2
    │   └── step: create_text_corpus
    │       └── status: results ready
    ├── stage: 3
    │   ├── step: extract_filename_column
    │   │   └── status: results ready
    │   └── step: extract_texts_column
    │       └── status: results ready
    ├── stage: 4
    │   ├── step: create_date_array
    │   │   └── status: results ready
    │   └── step: tokenize_content
    │       └── status: results ready
    ├── stage: 5
    │   └── step: preprocess_corpus
    │       └── status: results ready
    └── stage: 6
        └── step: generate_lda
            └── status: results ready

  pipeline outputs
    field name                               status   type          value id
    ──────────────────────────────────────────────────────────────────────────────────────────
    create_stopwords_list__stopwords_list    valid    list          c4c81256-c387-471e-8a05-76890eaefbcd
    import_text_corpus__file_bundle          valid    file_bundle   9856156c-b8dd-46c4-820d-e7f4934f89c9
    create_text_corpus__table                valid    table         640866c0-dac7-48f3-82d4-8a85617ca072
    extract_filename_column__array           valid    array         b74e8402-ee7c-446f-8f3f-6fe9048d1e7e
    extract_texts_column__array              valid    array         4b002cd8-2e48-456c-8cae-5479c5b1d612
    create_date_array__date_array            valid    array         4abcfa2e-8ba0-4b04-8414-2a3e5ac38eea
    tokenize_content__tokens_array           valid    array         e184c7e5-83c6-4821-ad3a-54b79afff9aa
    preprocess_corpus__tokens_array          valid    array         685323a1-0db9-41b8-a518-c247a089e6ca
    generate_lda__coherence_map              valid    dict          c746a2dd-a587-42c1-9967-9753789bf616
    generate_lda__coherence_table            valid    table         5fd0a91d-7682-4b08-a0a9-f1eb361d60f0
    generate_lda__topic_models               valid    dict          2c1ad51f-badf-4276-b368-d2e144e361bc

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
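In a later Python session, the saved workflow should be retrievable by the same alias. A hypothetical sketch (assumption: a Workflow.load classmethod mirrors the alias lookup that the CLI performs below):
# In a fresh session (hypothetical; assumes Workflow.load resolves aliases):
from kiara.interfaces.python_api.workflow import Workflow

workflow = Workflow.load("topic_modeling")
workflow.current_state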
Now, we can access our workflow in other environments, for example from the commandline:
! kiara workflow list
alias(es) workflow_id # steps # stages # states description
─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
topic_modeling 1a0f56d8-bfb6-459a-b7ad-ad52ea1828ff 9 6 1 Example topic-modeling end-to-end workflow.
! kiara workflow explain topic_modeling
╭─ Workflow: topic_modeling ───────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ │
│ documentation │
│ Example topic-modeling end-to-end workflow. │
│ │
│ author(s) │
│ │
│ workflow id 1a0f56d8-bfb6-459a-b7ad-ad52ea1828ff │
│ context │
│ │
│ current aliases {"inputs":{"extract_texts_column__column_name":"content_column_name","extract_filename_column__column_name":"fi… │
│ current inputs │
│ field value │
│ ────────────────────────────────────────────────────────────────────────────────────────────────────────────── │
│ compute_coherence True │
│ content_column_name content │
│ date_force_non_null -- none/not set -- │
│ date_parse_max 21 │
│ date_parse_min 11 │
│              date_remove_tokens                  -- none/not set --                                                 │
│ filename_column_name file_name │
│ languages │
│ Field Type Value Description │
│ ──────────────────────────────────────────────────────────────────────────────── │
│ item_schema object { The schema. │
│ "title": "list", │
│ "type": "object" │
│ } │
│ │
│ list_data array italian The data. │
│ │
│ python_class -- check source -- The python class of │
│ pytho… list which model │
│ pytho… built… instances are │
│ full_… list created. This is │
│ mostly meant as a │
│ hint for client │
│ applications. │
│ │
│ num_topics_max 9 │
│ num_topics_min 7 │
│ remove_all_numeric -- none/not set -- │
│ remove_alphanumeric -- none/not set -- │
│ remove_non_alpha -- none/not set -- │
│ remove_short_tokens -- none/not set -- │
│ stopword_lists │
│ Field Type Value Description │
│ ──────────────────────────────────────────────────────────────────────────────── │
│ item_schema object { The schema. │
│ "title": "list", │
│ "type": "object" │
│ } │
│ │
│ list_data array The data. │
│ │
│ python_class -- check source -- The python class of │
│ pytho… list which model │
│ pytho… built… instances are │
│ full_… list created. This is │
│ mostly meant as a │
│ hint for client │
│ applications. │
│ │
│ text_corpus_folder_path /home/markus/projects/kiara/dev/kiara.examples/examples/pipelines/topic_modeling/… │
│ to_lowercase -- none/not set -- │
│ tokenize_by_word -- none/not set -- │
│ words_per_topic -- none/not set -- │
│ │
│ current outputs │
│ field value │
│ ────────────────────────────────────────────────────────────────────────────────────────────────────────────── │
│ coherence_map │
│ dict data { │
│ "7": 0.23418388139804186, │
│ "8": 0.2320449393569416, │
│ "9": 0.232008834255219 │
│ } │
│ dict schema { │
│ "title": "dict", │
│ "type": "object" │
│ } │
│ │
│ coherence_table │
│ topic_id words num_topics │
│ ──────────────────────────────────────────────────────────────────────────────── │
│ 1 sempre, j, perchè, 1, degenerato, mente, mai, no, solo 7 │
│ 2 no, j, re, perchè, to, 1, sempre, te, egli, degenerato 7 │
│ 3 perchè, 1, sempre, no, j, to, degenerato, egli, te, re 7 │
│ 4 sempre, j, no, altri, perchè, te, 1, degenerato, solo, 7 │
│ 5 sempre, perchè, j, no, degenerato, te, solo, 1, egli, 7 │
│ 6 perchè, degenerato, egli, no, j, re, 1, sempre, fatto, 7 │
│ 7 j, sempre, degenerato, 1, egli, no, to, oggi, te, re, 7 │
│ 1 sempre, perchè, altri, 1, j, te, no, Figli, solo, re, 8 │
│ 2 j, sempre, degenerato, te, re, to, 1, perchè, mai, ta, 8 │
│ 3 sempre, 1, j, perchè, no, degenerato, fare, re, altri, 8 │
│ 4 no, perchè, sempre, j, degenerato, 1, egli, to, re, te 8 │
│ 5 sempre, perchè, degenerato, 1, egli, j, re, mai, altri 8 │
│ 6 sempre, altri, solo, to, j, perchè, mente, 1, re, Stat 8 │
│ 7 sempre, j, re, perchè, degenerato, 1, egli, no, te, to 8 │
│ 8 sempre, perchè, te, 1, j, re, egli, degenerato, no, to 8 │
│ 1 sempre, perchè, j, no, altri, degenerato, re, te, 1, m 9 │
│ 2 no, perchè, sempre, j, 1, degenerato, to, egli, oggi, 9 │
│ 3 sempre, te, perchè, j, no, to, 1, egli, altri, re, men 9 │
│ 4 sempre, j, degenerato, no, 1, re, te, perchè, fare, fa 9 │
│ 5 sempre, no, perchè, 1, j, altri, mente, to, fatto, deg 9 │
│ 6 sempre, egli, j, 1, no, degenerato, perchè, te, essi, 9 │
│ 7 sempre, to, perchè, j, 1, mente, altri, re, solo, essi 9 │
│ 8 sempre, perchè, egli, no, j, 1, degenerato, re, te, du 9 │
│ 9 perchè, j, sempre, no, degenerato, egli, 1, to, re, te 9 │
│ │
│ content_array │
│ LA RAGIONE │
│ LA RAG ONE │
│ LA RAGIONE │
│ contro i vili, i camorristi, i sicari, i falsari e gli austriacanti, nemici de │
│ contro i vili, i camorristi, i sicari, i falsari e gli austriacanti, nemici de │
│ LA RAGIONA │
│ LA RAGIONE │
│ LA RAGIONE │
│ contro i vili, i camorristi, i sicari, i falsari e gli austriacanti, nemici de │
│ LA RAG ONE │
│ contro 1 vili, i camorristi, i sicari, i falsari e gli austriacanti, nemici de │
│ ■■■ │
│ La Rassegna │
│ Both Phones │
│ ■ jSrìt** W?? iIK 38®f- i^M │
│ ■Both Phones │
│ │
│ date_array │
│ 1917-04-25 00:00:00 │
│ 1917-04-25 00:00:00 │
│ 1917-04-25 00:00:00 │
│ 1917-04-25 00:00:00 │
│ 1917-05-05 00:00:00 │
│ 1917-05-05 00:00:00 │
│ 1917-05-05 00:00:00 │
│ 1917-05-05 00:00:00 │
│ 1917-05-16 00:00:00 │
│ 1917-05-16 00:00:00 │
│ 1917-05-16 00:00:00 │
│ 1917-04-07 00:00:00 │
│ 1917-04-14 00:00:00 │
│ 1917-04-14 00:00:00 │
│ 1917-04-21 00:00:00 │
│ 1917-04-21 00:00:00 │
│ │
│ preprocessed_corpus │
│ ['RAGIONE', 'ORGANO', 'DIFESA', "ITALIANITÀ'", '1', 'vili', ',', 'camorristi', │
│ ['RAG', 'ONE', 'vili', ',', 'camorristi', ',', 'sicari', ',', 'falsari', 'aust │
│ ['RAGIONE', 'ORGANO', 'DIFESA', 'ITALIANITÀ', 'vili', ',', 'camorristi', ',', │
│ ['vili', ',', 'camorristi', ',', 'sicari', ',', 'falsari', 'austriacanti', ',' │
│ ['vili', ',', 'camorristi', ',', 'sicari', ',', 'falsari', 'austriacanti', ',' │
│ ['RAGIONA', 'ORGANO', 'DIFESA', 'ITALIANITÀ', 'vili', ',', 'camorristi', ',', │
│ ['RAGIONE', 'ORGANO', 'DIFESA', "ITALIANITÀ'", 'vili', ',', 'camorristi', ',', │
│ ['RAGIONE', 'vili', ',', '1', 'camorristi', ',', 'sicari', ',', 'falsari', 'au │
│ ['vili', ',', 'camorristi', ',', 'sicari', ',', 'falsari', 'austriacanti', ',' │
│ ['RAG', 'ONE', 'ORGANO', 'DIFESA', 'ITALIANITÀ', "''", 'vili', ',', 'camorrist │
│ ['1', 'vili', ',', 'camorristi', ',', 'sicari', ',', 'falsari', 'austriacanti' │
│ ['■■■', 'Rassegna', '_', 'Both', 'Phones', 'ANNO', 'No', '.', '1', 'perche', " │
│ ['Rassegna', 'Jjoth', 'Phones', 'ANNO', 'No', '.', '2', 'BASTA', '!', '...', ' │
│ ['Both', 'Phones', 'ANNO', '.', 'No', '.', '2', 'BASTA', '!', '...', 'uà', 'qu │
│ ['■', 'jSrìt', '*', '*', 'W', '?', '?', 'iIK', '38®f-', 'i^M', 'F', '<', '5É', │
│ ['■Both', 'Phones', 'ANNO', '11', '.', 'No', '.', '5', 'COSE', 'POSTO', 'va', │
│ │
│ text_corpus_file_bundle │
│ bundle name data │
│ number_of_files 16 │
│ size 298452 │
│ included files │
│ (relative) path size │
│ ────────────────────────────────────────────────────────── │
│ La_Ragione/sn84037024_1917-04-25_ed-1_seq-1_ocr… 16613 │
│ La_Ragione/sn84037024_1917-04-25_ed-2_seq-1_ocr… 16679 │
│ La_Ragione/sn84037024_1917-04-25_ed-3_seq-1_ocr… 16793 │
│ La_Ragione/sn84037024_1917-04-25_ed-4_seq-1_ocr… 16235 │
│ La_Ragione/sn84037024_1917-05-05_ed-1_seq-1_ocr… 18346 │
│ La_Ragione/sn84037024_1917-05-05_ed-2_seq-1_ocr… 18474 │
│ La_Ragione/sn84037024_1917-05-05_ed-3_seq-1_ocr… 18280 │
│ La_Ragione/sn84037024_1917-05-05_ed-4_seq-1_ocr… 18481 │
│ La_Ragione/sn84037024_1917-05-16_ed-1_seq-1_ocr… 18620 │
│ La_Ragione/sn84037024_1917-05-16_ed-2_seq-1_ocr… 18698 │
│ La_Ragione/sn84037024_1917-05-16_ed-3_seq-1_ocr… 18540 │
│ La_Rassegna/sn84037025_1917-04-07_ed-1_seq-1_oc… 19397 │
│ La_Rassegna/sn84037025_1917-04-14_ed-1_seq-1_oc… 20647 │
│ La_Rassegna/sn84037025_1917-04-14_ed-2_seq-1_oc… 20650 │
│ La_Rassegna/sn84037025_1917-04-21_ed-1_seq-1_oc… 21017 │
│ La_Rassegna/sn84037025_1917-04-21_ed-2_seq-1_oc… 20982 │
│ │
│ │
│ text_corpus_table │
│ id rel_path mime_type size content file_name │
│ ──────────────────────────────────────────────────────────────────────────────── │
│ 0 La_Ragione/sn84 text/plain 16613 LA RAGIONE sn84037024_1917 │
│ 1 La_Ragione/sn84 text/plain 16679 LA RAG ONE sn84037024_1917 │
│ 2 La_Ragione/sn84 text/plain 16793 LA RAGIONE sn84037024_1917 │
│ 3 La_Ragione/sn84 text/plain 16235 contro i vili, i sn84037024_1917 │
│ 4 La_Ragione/sn84 text/plain 18346 contro i vili, i sn84037024_1917 │
│ 5 La_Ragione/sn84 text/plain 18474 LA RAGIONA sn84037024_1917 │
│ 6 La_Ragione/sn84 text/plain 18280 LA RAGIONE sn84037024_1917 │
│ 7 La_Ragione/sn84 text/plain 18481 LA RAGIONE sn84037024_1917 │
│ 8 La_Ragione/sn84 text/plain 18620 contro i vili, i sn84037024_1917 │
│ 9 La_Ragione/sn84 text/plain 18698 LA RAG ONE sn84037024_1917 │
│ 10 La_Ragione/sn84 text/plain 18540 contro 1 vili, i sn84037024_1917 │
│ 11 La_Rassegna/sn8 text/plain 19397 ■■■ sn84037025_1917 │
│ 12 La_Rassegna/sn8 text/plain 20647 La Rassegna sn84037025_1917 │
│ 13 La_Rassegna/sn8 text/plain 20650 Both Phones sn84037025_1917 │
│ 14 La_Rassegna/sn8 text/plain 21017 ■ jSrìt** W?? iI sn84037025_1917 │
│ 15 La_Rassegna/sn8 text/plain 20982 ■Both Phones sn84037025_1917 │
│ │
│ tokenized_corpus │
│ ['LA', 'RAGIONE', 'ORGANO', 'DI', 'DIFESA', 'DELLA', "ITALIANITÀ'", 'contro', │
│ ['LA', 'RAG', 'ONE', 'contro', 'i', 'vili', ',', 'i', 'camorristi', ',', 'i', │
│ ['LA', 'RAGIONE', 'ORGANO', 'DI', 'DIFESA', 'DELLA', 'ITALIANITÀ', 'contro', ' │
│ ['contro', 'i', 'vili', ',', 'i', 'camorristi', ',', 'i', 'sicari', ',', 'i', │
│ ['contro', 'i', 'vili', ',', 'i', 'camorristi', ',', 'i', 'sicari', ',', 'i', │
│ ['LA', 'RAGIONA', 'ORGANO', 'DI', 'DIFESA', 'DELLA', 'ITALIANITÀ', 'contro', ' │
│ ['LA', 'RAGIONE', 'ORGANO', 'DI', 'DIFESA', 'DELLA', "ITALIANITÀ'", 'contro', │
│ ['LA', 'RAGIONE', 'contro', 'i', 'vili', ',', '1', 'camorristi', ',', 'i', 'si │
│ ['contro', 'i', 'vili', ',', 'i', 'camorristi', ',', 'i', 'sicari', ',', 'i', │
│ ['LA', 'RAG', 'ONE', 'ORGANO', 'DI', 'DIFESA', 'DELLA', 'ITALIANITÀ', "''", 'c │
│ ['contro', '1', 'vili', ',', 'i', 'camorristi', ',', 'i', 'sicari', ',', 'i', │
│ ['■■■', 'La', 'Rassegna', '_', 'I', 'Both', 'Phones', 'ANNO', 'L', 'No', '.', │
│ ['La', 'Rassegna', 'Jjoth', 'Phones', 'ANNO', 'L', 'No', '.', '2', 'BASTA', '! │
│ ['Both', 'Phones', 'ANNO', 'I', '.', 'No', '.', '2', 'BASTA', '!', '...', 'uà' │
│ ['■', 'jSrìt', '*', '*', 'W', '?', '?', 'iIK', '38®f-', 'i^M', 'F', '<', '5É', │
│ ['■Both', 'Phones', 'ANNO', '11', '.', 'No', '.', '5', 'LE', 'COSE', 'A', 'POS │
│ │
│ topic_models │
│ dict data { │
│ "7": [ │
│ [ │
│ 0, │
│ "0.085*\",\" + 0.028*\".\" + 0.012*\";\" + 0.006*\"'\" + … │
│ ], │
│ [ │
│ 1, │
│ "0.089*\",\" + 0.035*\".\" + 0.010*\";\" + 0.006*\"!\" + … │
│ ], │
│ [ │
│ 2, │
│ "0.101*\",\" + 0.031*\".\" + 0.012*\";\" + 0.006*\"'\" + … │
│ ], │
│ [ │
│ 3, │
│ "0.067*\",\" + 0.032*\".\" + 0.013*\";\" + 0.007*\"'\" + … │
│ ], │
│ [ │
│ 4, │
│ "0.075*\",\" + 0.037*\".\" + 0.013*\";\" + 0.008*\"!\" + … │
│ ], │
│ [ │
│ 5, │
│ "0.073*\",\" + 0.030*\".\" + 0.014*\";\" + 0.007*\"!\" + … │
│ ], │
│ [ │
│ 6, │
│ "0.069*\",\" + 0.026*\".\" + 0.007*\";\" + 0.007*\"!\" + … │
│ ] │
│ ], │
│ "8": [ │
│ [ │
│ 0, │
│ "0.028*\",\" + 0.010*\".\" + 0.004*\";\" + 0.004*\"'\" + … │
│ ], │
│ [ │
│ 1, │
│ "0.069*\",\" + 0.025*\".\" + 0.013*\";\" + 0.008*\"!\" + … │
│ ], │
│ [ │
│ 2, │
│ "0.056*\",\" + 0.027*\".\" + 0.012*\";\" + 0.007*\"'\" + … │
│ ], │
│ [ │
│ 3, │
│ "0.110*\",\" + 0.040*\".\" + 0.012*\";\" + 0.006*\"!\" + … │
│ ], │
│ [ │
│ 4, │
│ "0.029*\",\" + 0.017*\".\" + 0.006*\";\" + 0.004*\"!\" + … │
│ ], │
│ [ │
│ 5, │
│ "0.015*\",\" + 0.010*\".\" + 0.004*\";\" + 0.003*\"'\" + … │
│ ], │
│ [ │
│ 6, │
│ "0.057*\",\" + 0.033*\".\" + 0.013*\";\" + 0.008*\"!\" + … │
│ ], │
│ [ │
│ 7, │
│ "0.077*\",\" + 0.022*\".\" + 0.012*\";\" + 0.007*\"!\" + … │
│ ] │
│ ], │
│ "9": [ │
│ [ │
│ 0, │
│ "0.064*\",\" + 0.023*\".\" + 0.010*\";\" + 0.006*\"!\" + … │
│ ], │
│ [ │
│ 1, │
│ "0.049*\",\" + 0.032*\".\" + 0.009*\";\" + 0.005*\"'\" + … │
│ ], │
│ [ │
│ 2, │
│ "0.079*\",\" + 0.020*\".\" + 0.010*\";\" + 0.004*\":\" + … │
│ ], │
│ [ │
│ 3, │
│ "0.109*\",\" + 0.030*\".\" + 0.015*\";\" + 0.007*\"!\" + … │
│ ], │
│ [ │
│ 4, │
│ "0.051*\",\" + 0.030*\".\" + 0.010*\";\" + 0.005*\"''\" +… │
│ ], │
│ [ │
│ 5, │
│ "0.047*\",\" + 0.016*\".\" + 0.008*\";\" + 0.005*\"!\" + … │
│ ], │
│ [ │
│ 6, │
│ "0.074*\",\" + 0.030*\".\" + 0.010*\";\" + 0.006*\"!\" + … │
│ ], │
│ [ │
│ 7, │
│ "0.079*\",\" + 0.039*\".\" + 0.012*\";\" + 0.007*\"!\" + … │
│ ], │
│ [ │
│ 8, │
│ "0.094*\",\" + 0.041*\".\" + 0.012*\";\" + 0.007*\"'\" + … │
│ ] │
│ ] │
│ } │
│ dict schema { │
│ "title": "dict", │
│ "type": "object" │
│ } │
│ │
│ │
│ snapshot timeline │
│ 2022-10-03 13:07:56.228937+00:00 zdpuB2sHA2sfuLESmdATMSZm5Bi9TSJS697TGGVuJfvEqGrdT │
│ │
│ current state id zdpuB2sHA2sfuLESmdATMSZm5Bi9TSJS697TGGVuJfvEqGrdT │
│ current state details │
│ pipeline inputs │
│ field name status required default value id │
│ ─────────────────────────────────────────────────────────────────────────────────────────── │
│ create_stopwords_list__lang… valid no 76394b45-2dcb-44ca-b22b-01e0 │
│ ee25bd9f │
│ create_stopwords_list__stop… valid no cfd4f58b-358d-4640-af08-b745 │
│ ffaa7fd3 │
│ import_text_corpus__path valid yes 8721e6c2-9215-4020-93aa-f2fa │
│ 90eec13b │
│ extract_filename_column__co… valid yes 0b6bd257-60c6-4434-b66c-e855 │
│ aa203622 │
│ extract_texts_column__colum… valid yes fd3ef854-c982-4589-b5c4-112a │
│ ae1c597e │
│ create_date_array__force_no… valid no True 00000000-0000-0000-0000-0000 │
│ 00000002 │
│ create_date_array__max_index valid no a3846cdb-68cb-461a-9635-d0e9 │
│ bb73681e │
│ create_date_array__min_index valid no 4e8b9a9d-80e0-4595-a244-4007 │
│ 46a2215b │
│ create_date_array__remove_t… valid no [] 00000000-0000-0000-0000-0000 │
│ 00000002 │
│ tokenize_content__tokenize_… valid no True 00000000-0000-0000-0000-0000 │
│ 00000002 │
│ preprocess_corpus__remove_a… valid no False 00000000-0000-0000-0000-0000 │
│ 00000002 │
│ preprocess_corpus__remove_a… valid no False 00000000-0000-0000-0000-0000 │
│ 00000002 │
│ preprocess_corpus__remove_n… valid no False 00000000-0000-0000-0000-0000 │
│ 00000002 │
│ preprocess_corpus__remove_s… valid no False 00000000-0000-0000-0000-0000 │
│ 00000002 │
│ preprocess_corpus__to_lower… valid no False 00000000-0000-0000-0000-0000 │
│ 00000002 │
│ generate_lda__compute_coher… valid no False 0f2ecd3e-15a8-482b-b0e2-20b6 │
│ 5d5d4f53 │
│ generate_lda__num_topics_max valid no f9e07551-de8d-4e36-a9f6-cffc │
│ 01d4c149 │
│ generate_lda__num_topics_min valid no 7 220f512a-03c0-4693-bba7-7314 │
│ 2b8522ae │
│ generate_lda__words_per_top… valid no 10 00000000-0000-0000-0000-0000 │
│ 00000002 │
│ │
│ steps steps │
│ ├── stage: 1 │
│ │ ├── step: create_stopwords_list │
│ │ │ └── status: results ready │
│ │ └── step: import_text_corpus │
│ │ └── status: results ready │
│ ├── stage: 2 │
│ │ └── step: create_text_corpus │
│ │ └── status: results ready │
│ ├── stage: 3 │
│ │ ├── step: extract_filename_column │
│ │ │ └── status: results ready │
│ │ └── step: extract_texts_column │
│ │ └── status: results ready │
│ ├── stage: 4 │
│ │ ├── step: create_date_array │
│ │ │ └── status: results ready │
│ │ └── step: tokenize_content │
│ │ └── status: results ready │
│ ├── stage: 5 │
│ │ └── step: preprocess_corpus │
│ │ └── status: results ready │
│ └── stage: 6 │
│ └── step: generate_lda │
│ └── status: results ready │
│ pipeline outputs │
│ field name status type value id │
│ ─────────────────────────────────────────────────────────────────────────────────────────── │
│ create_stopwords_list__stopword… valid list c4c81256-c387-471e-8a05-76890ea │
│ efbcd │
│ import_text_corpus__file_bundle valid file_bundle 9856156c-b8dd-46c4-820d-e7f4934 │
│ f89c9 │
│ create_text_corpus__table valid table 640866c0-dac7-48f3-82d4-8a85617 │
│ ca072 │
│ extract_filename_column__array valid array b74e8402-ee7c-446f-8f3f-6fe9048 │
│ d1e7e │
│ extract_texts_column__array valid array 4b002cd8-2e48-456c-8cae-5479c5b │
│ 1d612 │
│ create_date_array__date_array valid array 4abcfa2e-8ba0-4b04-8414-2a3e5ac │
│ 38eea │
│ tokenize_content__tokens_array valid array e184c7e5-83c6-4821-ad3a-54b79af │
│ ff9aa │
│ preprocess_corpus__tokens_array valid array 685323a1-0db9-41b8-a518-c247a08 │
│ 9e6ca │
│ generate_lda__coherence_map valid dict c746a2dd-a587-42c1-9967-9753789 │
│ bf616 │
│ generate_lda__coherence_table valid table 5fd0a91d-7682-4b08-a0a9-f1eb361 │
│ d60f0 │
│ generate_lda__topic_models valid dict 2c1ad51f-badf-4276-b368-d2e144e │
│ 361bc │
│ │
│ │
│ │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯