%env CONSOLE_WIDTH=140
from kiara.interfaces.python_api.workflow import Workflow
from kiara.utils.jupyter import graph_to_image
from kiara.utils.cli import terminal_print_model
env: CONSOLE_WIDTH=140
doc = """Example topic-modeling end-to-end workflow."""
workflow = Workflow.create("topic_modeling", doc=doc, replace_existing_alias=True)
# Creating step: import_text_corpus
workflow.add_step(operation="import.file_bundle", step_id="import_text_corpus")
╭─ Step: import_text_corpus ──────────────────────────────────────────────────────────────────────╮

  step_id       import_text_corpus
  module type   import.file_bundle
  module doc    Import a folder (file_bundle) from the local filesystem.

  inputs
    field name                type     description                                Required   Default
    ─────────────────────────────────────────────────────────────────────────────────────────────────
    import_text_corpus.path   string   The local path of the folder to import.   yes        -- no default --

  outputs
    field name                       type          description
    ─────────────────────────────────────────────────────────────────────────
    import_text_corpus.file_bundle   file_bundle   The imported file bundle.

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
# Creating step: create_stopwords_list
workflow.add_step(operation="create.stopwords_list", step_id="create_stopwords_list")
╭─ Step: create_stopwords_list ───────────────────────────────────────────────────────────────────╮

  step_id       create_stopwords_list
  module type   create.stopwords_list
  module doc    Create a list of stopwords from one or multiple sources.

                This will download nltk stopwords if necessary, and merge all input lists into a
                single, sorted list without duplicates.

  inputs
    field name                              type   description                            Required   Default
    ──────────────────────────────────────────────────────────────────────────────────────────────────────────
    create_stopwords_list.languages         list   A list of languages, will be used to   no         -- no default --
                                                   retrieve language-specific stopwords
                                                   from nltk.
    create_stopwords_list.stopword_lists    list   A list of lists of stopwords.          no         -- no default --

  outputs
    field name                              type   description
    ─────────────────────────────────────────────────────────────────────────
    create_stopwords_list.stopwords_list    list   A sorted list of unique stopwords.

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
# Creating step: create_text_corpus
step_create_text_corpus_config = {'constants': {}, 'defaults': {}, 'source_type': 'text_file_bundle', 'target_type': 'table', 'ignore_errors': False}
workflow.add_step(
operation="create.table",
module_config=step_create_text_corpus_config,
step_id="create_text_corpus")
╭─ Step: create_text_corpus ──────────────────────────────────────────────────────────────────────╮

  step_id         create_text_corpus
  module type     create.table
  module_config   {
                    "source_type": "text_file_bundle",
                    "target_type": "table",
                    "ignore_errors": false
                  }
  module doc      -- n/a --

  inputs
    field name                            type               description                     Required   Default
    ─────────────────────────────────────────────────────────────────────────────────────────────────────────────
    create_text_corpus.text_file_bundle   text_file_bundle   The type of the source value.   yes        -- no default --

  outputs
    field name                 type    description
    ──────────────────────────────────────────────────
    create_text_corpus.table   table   The result value.

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
# Connecting input(s) of step 'create_text_corpus'
workflow.connect_fields("create_text_corpus.text_file_bundle", "import_text_corpus.file_bundle")
# Creating step: extract_texts_column
workflow.add_step(operation="table.cut_column", step_id="extract_texts_column")
╭─ Step: extract_texts_column ────────────────────────────────────────────────────────────────────╮

  step_id       extract_texts_column
  module type   table.cut_column
  module doc    Cut off one column from a table, returning an array.

  inputs
    field name                         type     description                          Required   Default
    ──────────────────────────────────────────────────────────────────────────────────────────────────────
    extract_texts_column.table         table    A table.                             yes        -- no default --
    extract_texts_column.column_name   string   The name of the column to extract.   yes        -- no default --

  outputs
    field name                   type    description
    ─────────────────────────────────────────────────
    extract_texts_column.array   array   The column.

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
# Connecting input(s) of step 'extract_texts_column'
workflow.connect_fields("extract_texts_column.table", "create_text_corpus.table")
# Creating step: extract_filename_column
workflow.add_step(operation="table.cut_column", step_id="extract_filename_column")
╭─ Step: extract_filename_column ─────────────────────────────────────────────────────────────────╮

  step_id       extract_filename_column
  module type   table.cut_column
  module doc    Cut off one column from a table, returning an array.

  inputs
    field name                            type     description                          Required   Default
    ─────────────────────────────────────────────────────────────────────────────────────────────────────────
    extract_filename_column.table         table    A table.                             yes        -- no default --
    extract_filename_column.column_name   string   The name of the column to extract.   yes        -- no default --

  outputs
    field name                      type    description
    ────────────────────────────────────────────────────
    extract_filename_column.array   array   The column.

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
# Connecting input(s) of step 'extract_filename_column'
workflow.connect_fields("extract_filename_column.table", "create_text_corpus.table")
# Creating step: create_date_array
workflow.add_step(operation="parse.date_array", step_id="create_date_array")
╭─ Step: create_date_array ───────────────────────────────────────────────────────────────────────╮

  step_id         create_date_array
  module type     parse.date_array
  module_config   {
                    "add_inputs": true,
                    "input_fields": [],
                    "force_non_null": true,
                    "min_index": null,
                    "max_index": null,
                    "remove_tokens": []
                  }
  module doc      Create an array of date objects from an array of strings.

                  This module is very simplistic at the moment; more functionality and options
                  will be added in the future.

                  At its core, this module uses the standard parser from the dateutil package to
                  parse strings into dates. As this parser can't handle complex strings, the
                  input strings can be pre-processed in the following ways:

                  • 'cut' non-relevant parts of the string (using the 'min_index' & 'max_index'
                    input/config options)
                  • remove matching tokens from the string, and replace them with a single
                    whitespace (using the 'remove_tokens' option)

                  By default, if an input string can't be parsed this module will raise an
                  exception. This can be prevented by setting this module's 'force_non_null'
                  config option or input to 'False', in which case un-parsable strings will
                  appear as 'NULL' values in the resulting array.

  inputs
    field name                          type      description                                Required   Default
    ──────────────────────────────────────────────────────────────────────────────────────────────────────────────
    create_date_array.array             array     The input array.                           yes        -- no default --
    create_date_array.force_non_null    boolean   If set to 'True', raise an error if any    no         True
                                                  of the strings in the array can't be
                                                  parsed.
    create_date_array.min_index         integer   The minimum index from where to start      no         -- no default --
                                                  parsing the string(s).
    create_date_array.max_index         integer   The maximum index until which to parse     no         -- no default --
                                                  the string(s).
    create_date_array.remove_tokens     list      A list of tokens/characters to replace     no         []
                                                  with a single white-space before parsing
                                                  the input.

  outputs
    field name                     type    description
    ─────────────────────────────────────────────────────────────────────────────────
    create_date_array.date_array   array   The resulting array with items of a date data type.

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
# Connecting input(s) of step 'create_date_array'
workflow.connect_fields("create_date_array.array", "extract_filename_column.array")
# Creating step: tokenize_content
workflow.add_step(operation="tokenize.texts_array", step_id="tokenize_content")
╭─ Step: tokenize_content ────────────────────────────────────────────────────────────────────────╮

  step_id       tokenize_content
  module type   tokenize.texts_array
  module doc    Split sentences into words, or words into characters.

                In other words, this operation establishes the word boundaries (i.e. tokens),
                a very helpful way of finding patterns; it is also the typical step prior to
                stemming and lemmatization.

  inputs
    field name                          type      description                                Required   Default
    ──────────────────────────────────────────────────────────────────────────────────────────────────────────────
    tokenize_content.texts_array        array     An array of text items to be tokenized.    yes        -- no default --
    tokenize_content.tokenize_by_word   boolean   Whether to tokenize by word (default),     no         True
                                                  or character.

  outputs
    field name                      type    description
    ─────────────────────────────────────────────────────────────────────────────────
    tokenize_content.tokens_array   array   The tokenized content, as an array of lists of strings.

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
# Connecting input(s) of step 'tokenize_content'
workflow.connect_fields("tokenize_content.texts_array", "extract_texts_column.array")
# Creating step: preprocess_corpus
workflow.add_step(operation="preprocess.tokens_array", step_id="preprocess_corpus")
╭─ Step: preprocess_corpus ───────────────────────────────────────────────────────────────────────╮

  step_id       preprocess_corpus
  module type   preprocess.tokens_array
  module doc    Preprocess lists of tokens, incl. lowercasing, removing special characters, etc.

                Lowercasing: Lowercase the words. This operation is a double-edged sword. It can
                be effective at yielding potentially better results in the case of relatively
                small datasets or datasets with a high percentage of OCR mistakes. For instance,
                if lowercasing is not performed, the algorithm will treat USA, Usa, usa, UsA,
                uSA, etc. as distinct tokens, even though they may all refer to the same entity.
                On the other hand, if the dataset does not contain such OCR mistakes, then it may
                become difficult to distinguish between homonyms and make interpreting the topics
                much harder.

                Removing stopwords and words with less than three characters: Remove low
                information words. These are typically words such as articles, pronouns,
                prepositions, conjunctions, etc. which are not semantically salient. There are
                numerous stopword lists available for many, though not all, languages which can
                be easily adapted to the individual researcher's needs. Removing words with less
                than three characters may additionally remove many OCR mistakes. Both these
                operations have the dual advantage of yielding more reliable results while
                reducing the size of the dataset, thus in turn reducing the required processing
                power. This step can therefore hardly be considered optional in TM.

                Noise removal: Remove elements such as punctuation marks, special characters,
                numbers, html formatting, etc. This operation is again concerned with removing
                elements that may not be relevant to the text analysis and in fact interfere
                with it. Depending on the dataset and research question, this operation can
                become essential.

  inputs
    field name                              type      description                              Required   Default
    ────────────────────────────────────────────────────────────────────────────────────────────────────────────────
    preprocess_corpus.tokens_array          array     The tokens array to pre-process.         yes        -- no default --
    preprocess_corpus.to_lowercase          boolean   Apply lowercasing to the text.           no         False
    preprocess_corpus.remove_alphanumeric   boolean   Remove all tokens that include numbers   no         False
                                                      (e.g. ex1ample).
    preprocess_corpus.remove_non_alpha      boolean   Remove all tokens that include           no         False
                                                      punctuation and numbers (e.g.
                                                      ex1a.mple).
    preprocess_corpus.remove_all_numeric    boolean   Remove all tokens that contain numbers   no         False
                                                      only (e.g. 876).
    preprocess_corpus.remove_short_tokens   integer   Remove tokens shorter than a certain     no         False
                                                      length. If value is <= 0, no filtering
                                                      will be done.
    preprocess_corpus.remove_stopwords      list      Remove stopwords.                        no         -- no default --

  outputs
    field name                       type    description
    ────────────────────────────────────────────────────────────────────────────────────
    preprocess_corpus.tokens_array   array   The pre-processed content, as an array of lists of strings.

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
# Connecting input(s) of step 'preprocess_corpus'
workflow.connect_fields("preprocess_corpus.tokens_array", "tokenize_content.tokens_array")
workflow.connect_fields("preprocess_corpus.remove_stopwords", "create_stopwords_list.stopwords_list")
# Creating step: generate_lda
workflow.add_step(operation="generate.LDA.for.tokens_array", step_id="generate_lda")
╭─ Step: generate_lda ────────────────────────────────────────────────────────────────────────────╮

  step_id       generate_lda
  module type   generate.LDA.for.tokens_array
  module doc    Perform Latent Dirichlet Allocation on a tokenized corpus.

                This module computes models for a range of numbers of topics provided by the
                user.

  inputs
    field name                      type      description                                Required   Default
    ──────────────────────────────────────────────────────────────────────────────────────────────────────────
    generate_lda.tokens_array       array     The text corpus.                           yes        -- no default --
    generate_lda.num_topics_min     integer   The minimal number of topics.              no         7
    generate_lda.num_topics_max     integer   The maximum number of topics.              no         -- no default --
    generate_lda.compute_coherence  boolean   Whether to compute the coherence score     no         False
                                              for each model.
    generate_lda.words_per_topic    integer   How many words per topic to put in the     no         10
                                              result model.

  outputs
    field name                    type    description
    ───────────────────────────────────────────────────────────────────────────────────────
    generate_lda.topic_models     dict    A dictionary with one coherence model table for each number of topics.
    generate_lda.coherence_table  table   Coherence details.
    generate_lda.coherence_map    dict    A map with the coherence value for every number of topics.

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
# Connecting input(s) of step 'generate_lda'
workflow.connect_fields("generate_lda.tokens_array", "preprocess_corpus.tokens_array")
Setting workflow input/output names (optional)
To make our workflow nicer to use, we can set aliases for its inputs and outputs.
workflow.set_input_alias(input_field="extract_texts_column.column_name", alias="content_column_name")
workflow.set_input_alias(input_field="extract_filename_column.column_name", alias="filename_column_name")
workflow.set_input_alias(input_field="import_text_corpus.path", alias="text_corpus_folder_path")
workflow.set_input_alias(input_field="create_date_array.min_index", alias="date_parse_min")
workflow.set_input_alias(input_field="create_date_array.max_index", alias="date_parse_max")
workflow.set_input_alias(input_field="create_date_array.force_non_null", alias="date_force_non_null")
workflow.set_input_alias(input_field="create_date_array.remove_tokens", alias="date_remove_tokensl")
workflow.set_input_alias(input_field="tokenize_content.tokenize_by_word", alias="tokenize_by_word")
workflow.set_input_alias(input_field="generate_lda.num_topics_min", alias="num_topics_min")
workflow.set_input_alias(input_field="generate_lda.num_topics_max", alias="num_topics_max")
workflow.set_input_alias(input_field="generate_lda.compute_coherence", alias="compute_coherence")
workflow.set_input_alias(input_field="generate_lda.words_per_topic", alias="words_per_topic")
workflow.set_input_alias(input_field="create_stopwords_list.languages", alias="languages")
workflow.set_input_alias(input_field="create_stopwords_list.stopword_lists", alias="stopword_lists")
workflow.set_input_alias(input_field="preprocess_corpus.to_lowercase", alias="to_lowercase")
workflow.set_input_alias(input_field="preprocess_corpus.remove_alphanumeric", alias="remove_alphanumeric")
workflow.set_input_alias(input_field="preprocess_corpus.remove_non_alpha", alias="remove_non_alpha")
workflow.set_input_alias(input_field="preprocess_corpus.remove_all_numeric", alias="remove_all_numeric")
workflow.set_input_alias(input_field="preprocess_corpus.remove_short_tokens", alias="remove_short_tokens")
workflow.set_input_alias(input_field="preprocess_corpus.remove_stopwords", alias="remove_stopwords")
workflow.set_output_alias(output_field="import_text_corpus.file_bundle", alias="text_corpus_file_bundle")
workflow.set_output_alias(output_field="create_text_corpus.table", alias="text_corpus_table")
workflow.set_output_alias(output_field="extract_texts_column.array", alias="content_array")
workflow.set_output_alias(output_field="tokenize_content.tokens_array", alias="tokenized_corpus")
workflow.set_output_alias(output_field="preprocess_corpus.tokens_array", alias="preprocessed_corpus")
workflow.set_output_alias(output_field="generate_lda.topic_models", alias="topic_models")
workflow.set_output_alias(output_field="generate_lda.coherence_map", alias="coherence_map")
workflow.set_output_alias(output_field="generate_lda.coherence_table", alias="coherence_table")
workflow.set_output_alias(output_field="create_date_array.date_array", alias="date_array")
Workflow information
After our workflow is wired up, we can look at its structure and other properties.
Workflow status
A workflow consists of a series of 'states'; the most relevant one is usually the most recent. We can investigate the latest state's details like so:
workflow.current_state
╭─────────────────────────────────────────────────────────────────────────────────────────────────╮

  state id   zdpuAuzjRbxHaGKvGVJ7rxJQ27Hfy2k7qLTxC3vxowNLwnH4U

  pipeline inputs
    field name                               status    required   default   value id
    ────────────────────────────────────────────────────────────────────────────────────────────────
    create_stopwords_list__languages         valid     no                   00000000-0000-0000-0000-000000000001
    create_stopwords_list__stopword_lists    valid     no                   00000000-0000-0000-0000-000000000001
    import_text_corpus__path                 not set   yes                  00000000-0000-0000-0000-000000000001
    extract_filename_column__column_name     not set   yes                  00000000-0000-0000-0000-000000000001
    extract_texts_column__column_name        not set   yes                  00000000-0000-0000-0000-000000000001
    create_date_array__force_non_null        valid     no         True      00000000-0000-0000-0000-000000000001
    create_date_array__max_index             valid     no                   00000000-0000-0000-0000-000000000001
    create_date_array__min_index             valid     no                   00000000-0000-0000-0000-000000000001
    create_date_array__remove_tokens         valid     no         []        00000000-0000-0000-0000-000000000001
    tokenize_content__tokenize_by_word       valid     no         True      00000000-0000-0000-0000-000000000001
    preprocess_corpus__remove_all_numeric    valid     no         False     00000000-0000-0000-0000-000000000001
    preprocess_corpus__remove_alphanumeric   valid     no         False     00000000-0000-0000-0000-000000000001
    preprocess_corpus__remove_non_alpha      valid     no         False     00000000-0000-0000-0000-000000000001
    preprocess_corpus__remove_short_tokens   valid     no         False     00000000-0000-0000-0000-000000000001
    preprocess_corpus__to_lowercase          valid     no         False     00000000-0000-0000-0000-000000000001
    generate_lda__compute_coherence          valid     no         False     00000000-0000-0000-0000-000000000001
    generate_lda__num_topics_max             valid     no                   00000000-0000-0000-0000-000000000001
    generate_lda__num_topics_min             valid     no         7         00000000-0000-0000-0000-000000000001
    generate_lda__words_per_topic            valid     no         10        00000000-0000-0000-0000-000000000001

  steps
    ├── stage: 1
    │   ├── step: create_stopwords_list
    │   │   └── status: inputs ready
    │   └── step: import_text_corpus
    │       └── status: inputs invalid
    │           └── path: not set
    ├── stage: 2
    │   └── step: create_text_corpus
    │       └── status: inputs invalid
    │           └── text_file_bundle: not set
    ├── stage: 3
    │   ├── step: extract_filename_column
    │   │   └── status: inputs invalid
    │   │       ├── table: not set
    │   │       └── column_name: not set
    │   └── step: extract_texts_column
    │       └── status: inputs invalid
    │           ├── table: not set
    │           └── column_name: not set
    ├── stage: 4
    │   ├── step: create_date_array
    │   │   └── status: inputs invalid
    │   │       └── array: not set
    │   └── step: tokenize_content
    │       └── status: inputs invalid
    │           └── texts_array: not set
    ├── stage: 5
    │   └── step: preprocess_corpus
    │       └── status: inputs invalid
    │           └── tokens_array: not set
    └── stage: 6
        └── step: generate_lda
            └── status: inputs invalid
                └── tokens_array: not set

  pipeline outputs
    field name                               status    type   value id
    ──────────────────────────────────────────────────────────────────────────────────────
    create_stopwords_list__stopwords_list    not set   none   00000000-0000-0000-0000-000000000001
    import_text_corpus__file_bundle          not set   none   00000000-0000-0000-0000-000000000001
    create_text_corpus__table                not set   none   00000000-0000-0000-0000-000000000001
    extract_filename_column__array           not set   none   00000000-0000-0000-0000-000000000001
    extract_texts_column__array              not set   none   00000000-0000-0000-0000-000000000001
    create_date_array__date_array            not set   none   00000000-0000-0000-0000-000000000001
    tokenize_content__tokens_array           not set   none   00000000-0000-0000-0000-000000000001
    preprocess_corpus__tokens_array          not set   none   00000000-0000-0000-0000-000000000001
    generate_lda__coherence_map              not set   none   00000000-0000-0000-0000-000000000001
    generate_lda__coherence_table            not set   none   00000000-0000-0000-0000-000000000001
    generate_lda__topic_models               not set   none   00000000-0000-0000-0000-000000000001

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
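The terminal_print_model helper imported at the top renders kiara model objects explicitly, which is handy outside of Jupyter's automatic display. A minimal sketch, assuming the state object returned by current_state can be passed to it directly:
# Explicitly render the current state (assumption: terminal_print_model
# accepts the model object returned by workflow.current_state).
terminal_print_model(workflow.current_state)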
Pipeline execution graph
Let's look at the execution graph for the current workflow pipeline:
graph_to_image(workflow.pipeline.execution_graph)
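If image rendering is unavailable, the same information can be inspected programmatically. A small sketch, under the assumption that the execution graph is a networkx DiGraph (which the graph_to_image helper suggests):
# Walk the execution graph directly (assumption: it is a networkx DiGraph).
graph = workflow.pipeline.execution_graph
for node in graph.nodes:
    print(node)
for source, target in graph.edges:
    print(f"{source} -> {target}")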
Workflow inputs
Once a workflow has an assembled pipeline, we can set its inputs. We use the input field names that we got from the result of the workflow.current_state call.
workflow.set_input("text_corpus_folder_path", "/home/markus/projects/kiara/dev/kiara.examples/examples/pipelines/topic_modeling/../../data/text_corpus/data")
workflow.set_input("content_column_name", "content")
workflow.set_input("filename_column_name", "file_name")
workflow.set_input("date_force_non_null", None)
workflow.set_input("date_parse_min", 11)
workflow.set_input("date_parse_max", 21)
workflow.set_input("date_remove_tokensl", None)
workflow.set_input("tokenize_by_word", None)
workflow.set_input("languages", ['italian'])
workflow.set_input("stopword_lists", [])
workflow.set_input("to_lowercase", None)
workflow.set_input("remove_alphanumeric", None)
workflow.set_input("remove_non_alpha", None)
workflow.set_input("remove_all_numeric", None)
workflow.set_input("remove_short_tokens", None)
workflow.set_input("num_topics_min", 7)
workflow.set_input("num_topics_max", 9)
workflow.set_input("compute_coherence", True)
workflow.set_input("words_per_topic", None)
# process all workflow steps that can be processed
workflow.process_steps()
# print the current state, after we set our inputs
workflow.current_state
╭─────────────────────────────────────────────────────────────────────────────────────────────────╮

  state id   zdpuB2sHA2sfuLESmdATMSZm5Bi9TSJS697TGGVuJfvEqGrdT

  pipeline inputs
    field name                               status   required   default   value id
    ───────────────────────────────────────────────────────────────────────────────────────────────
    create_stopwords_list__languages         valid    no                   76394b45-2dcb-44ca-b22b-01e0ee25bd9f
    create_stopwords_list__stopword_lists    valid    no                   cfd4f58b-358d-4640-af08-b745ffaa7fd3
    import_text_corpus__path                 valid    yes                  8721e6c2-9215-4020-93aa-f2fa90eec13b
    extract_filename_column__column_name     valid    yes                  0b6bd257-60c6-4434-b66c-e855aa203622
    extract_texts_column__column_name        valid    yes                  fd3ef854-c982-4589-b5c4-112aae1c597e
    create_date_array__force_non_null        valid    no         True      00000000-0000-0000-0000-000000000002
    create_date_array__max_index             valid    no                   a3846cdb-68cb-461a-9635-d0e9bb73681e
    create_date_array__min_index             valid    no                   4e8b9a9d-80e0-4595-a244-400746a2215b
    create_date_array__remove_tokens         valid    no         []        00000000-0000-0000-0000-000000000002
    tokenize_content__tokenize_by_word       valid    no         True      00000000-0000-0000-0000-000000000002
    preprocess_corpus__remove_all_numeric    valid    no         False     00000000-0000-0000-0000-000000000002
    preprocess_corpus__remove_alphanumeric   valid    no         False     00000000-0000-0000-0000-000000000002
    preprocess_corpus__remove_non_alpha      valid    no         False     00000000-0000-0000-0000-000000000002
    preprocess_corpus__remove_short_tokens   valid    no         False     00000000-0000-0000-0000-000000000002
    preprocess_corpus__to_lowercase          valid    no         False     00000000-0000-0000-0000-000000000002
    generate_lda__compute_coherence          valid    no         False     0f2ecd3e-15a8-482b-b0e2-20b65d5d4f53
    generate_lda__num_topics_max             valid    no                   f9e07551-de8d-4e36-a9f6-cffc01d4c149
    generate_lda__num_topics_min             valid    no         7         220f512a-03c0-4693-bba7-73142b8522ae
    generate_lda__words_per_topic            valid    no         10        00000000-0000-0000-0000-000000000002

  steps
    ├── stage: 1
    │   ├── step: create_stopwords_list
    │   │   └── status: results ready
    │   └── step: import_text_corpus
    │       └── status: results ready
    ├── stage: 2
    │   └── step: create_text_corpus
    │       └── status: results ready
    ├── stage: 3
    │   ├── step: extract_filename_column
    │   │   └── status: results ready
    │   └── step: extract_texts_column
    │       └── status: results ready
    ├── stage: 4
    │   ├── step: create_date_array
    │   │   └── status: results ready
    │   └── step: tokenize_content
    │       └── status: results ready
    ├── stage: 5
    │   └── step: preprocess_corpus
    │       └── status: results ready
    └── stage: 6
        └── step: generate_lda
            └── status: results ready

  pipeline outputs
    field name                               status   type          value id
    ──────────────────────────────────────────────────────────────────────────────────────────
    create_stopwords_list__stopwords_list    valid    list          c4c81256-c387-471e-8a05-76890eaefbcd
    import_text_corpus__file_bundle          valid    file_bundle   9856156c-b8dd-46c4-820d-e7f4934f89c9
    create_text_corpus__table                valid    table         640866c0-dac7-48f3-82d4-8a85617ca072
    extract_filename_column__array           valid    array         b74e8402-ee7c-446f-8f3f-6fe9048d1e7e
    extract_texts_column__array              valid    array         4b002cd8-2e48-456c-8cae-5479c5b1d612
    create_date_array__date_array            valid    array         4abcfa2e-8ba0-4b04-8414-2a3e5ac38eea
    tokenize_content__tokens_array           valid    array         e184c7e5-83c6-4821-ad3a-54b79afff9aa
    preprocess_corpus__tokens_array          valid    array         685323a1-0db9-41b8-a518-c247a089e6ca
    generate_lda__coherence_map              valid    dict          c746a2dd-a587-42c1-9967-9753789bf616
    generate_lda__coherence_table            valid    table         5fd0a91d-7682-4b08-a0a9-f1eb361d60f0
    generate_lda__topic_models               valid    dict          2c1ad51f-badf-4276-b368-d2e144e361bc

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
Workflow outputs
To print the actual data of the workflow's current outputs, we access the current_output_values property of the workflow object:
workflow.current_output_values
╭─────────────────────────────────────────────────────────────────────────────────────────────────╮

  field                     value
  ─────────────────────────────────────────────────────────────────────────────────────────────────

  coherence_map
      dict data     {
                      "7": 0.23418388139804186,
                      "8": 0.2320449393569416,
                      "9": 0.232008834255219
                    }
      dict schema   {
                      "title": "dict",
                      "type": "object"
                    }

  coherence_table
      topic_id   words                                                                            num_topics
      ───────────────────────────────────────────────────────────────────────────────────────────────────────
      1          sempre, j, perchè, 1, degenerato, mente, mai, no, solo, re, altri, to, te, egl   7
      2          no, j, re, perchè, to, 1, sempre, te, egli, degenerato, mente, do, fatto, essi   7
      3          perchè, 1, sempre, no, j, to, degenerato, egli, te, re, mente, oggi, essi, far   7
      4          sempre, j, no, altri, perchè, te, 1, degenerato, solo, to, re, fatto, egli, ta   7
      5          sempre, perchè, j, no, degenerato, te, solo, 1, egli, re, altri, do, essi, ri,   7
      6          perchè, degenerato, egli, no, j, re, 1, sempre, fatto, te, ta, to, oggi, quel,   7
      7          j, sempre, degenerato, 1, egli, no, to, oggi, te, re, mente, perchè, quando, e   7
      1          sempre, perchè, altri, 1, j, te, no, Figli, solo, re, po, nè, degenerato, to,    8
      2          j, sempre, degenerato, te, re, to, 1, perchè, mai, ta, egli, altri, no, oggi,    8
      3          sempre, 1, j, perchè, no, degenerato, fare, re, altri, solo, mente, nè, Figli,   8
      4          no, perchè, sempre, j, degenerato, 1, egli, to, re, te, mente, essi, solo, fat   8
      5          sempre, perchè, degenerato, 1, egli, j, re, mai, altri, no, do, te, giornale,    8
      6          sempre, altri, solo, to, j, perchè, mente, 1, re, State, ta, mai, nè, te, Figl   8
      7          sempre, j, re, perchè, degenerato, 1, egli, no, te, to, mai, mente, solo, oggi   8
      8          sempre, perchè, te, 1, j, re, egli, degenerato, no, to, fare, ta, oggi, mai, r   8
      1          sempre, perchè, j, no, altri, degenerato, re, te, 1, mente, essi, mai, egli, t   9
      2          no, perchè, sempre, j, 1, degenerato, to, egli, oggi, fatto, re, te, solo, mai   9
      3          sempre, te, perchè, j, no, to, 1, egli, altri, re, mente, mai, degenerato, sol   9
      4          sempre, j, degenerato, no, 1, re, te, perchè, fare, fatto, to, mente, mai, egl   9
      5          sempre, no, perchè, 1, j, altri, mente, to, fatto, degenerato, nè, re, solo, e   9
      6          sempre, egli, j, 1, no, degenerato, perchè, te, essi, quel, fatto, re, ta, ogg   9
      7          sempre, to, perchè, j, 1, mente, altri, re, solo, essi, te, fare, po, degenera   9
      8          sempre, perchè, egli, no, j, 1, degenerato, re, te, due, solo, to, altri, ment   9
      9          perchè, j, sempre, no, degenerato, egli, 1, to, re, te, oggi, giornale, solo,    9

  content_array
      LA RAGIONE
      LA RAG ONE
      LA RAGIONE
      contro i vili, i camorristi, i sicari, i falsari e gli austriacanti, nemici dell ...
      contro i vili, i camorristi, i sicari, i falsari e gli austriacanti, nemici dell ...
      LA RAGIONA
      LA RAGIONE
      LA RAGIONE
      contro i vili, i camorristi, i sicari, i falsari e gli austriacanti, nemici dell ...
      LA RAG ONE
      contro 1 vili, i camorristi, i sicari, i falsari e gli austriacanti, nemici dell ...
      ■■■
      La Rassegna
      Both Phones
      ■ jSrìt** W?? iIK 38®f- i^M
      ■Both Phones

  date_array
      1917-04-25 00:00:00
      1917-04-25 00:00:00
      1917-04-25 00:00:00
      1917-04-25 00:00:00
      1917-05-05 00:00:00
      1917-05-05 00:00:00
      1917-05-05 00:00:00
      1917-05-05 00:00:00
      1917-05-16 00:00:00
      1917-05-16 00:00:00
      1917-05-16 00:00:00
      1917-04-07 00:00:00
      1917-04-14 00:00:00
      1917-04-14 00:00:00
      1917-04-21 00:00:00
      1917-04-21 00:00:00

  preprocessed_corpus
      ['RAGIONE', 'ORGANO', 'DIFESA', "ITALIANITÀ'", '1', 'vili', ',', 'camorristi', ' ...
      ['RAG', 'ONE', 'vili', ',', 'camorristi', ',', 'sicari', ',', 'falsari', 'austri ...
      ['RAGIONE', 'ORGANO', 'DIFESA', 'ITALIANITÀ', 'vili', ',', 'camorristi', ',', 's ...
      ['vili', ',', 'camorristi', ',', 'sicari', ',', 'falsari', 'austriacanti', ',',  ...
      ['vili', ',', 'camorristi', ',', 'sicari', ',', 'falsari', 'austriacanti', ',',  ...
      ['RAGIONA', 'ORGANO', 'DIFESA', 'ITALIANITÀ', 'vili', ',', 'camorristi', ',', 's ...
      ['RAGIONE', 'ORGANO', 'DIFESA', "ITALIANITÀ'", 'vili', ',', 'camorristi', ',', ' ...
      ['RAGIONE', 'vili', ',', '1', 'camorristi', ',', 'sicari', ',', 'falsari', 'aust ...
      ['vili', ',', 'camorristi', ',', 'sicari', ',', 'falsari', 'austriacanti', ',',  ...
      ['RAG', 'ONE', 'ORGANO', 'DIFESA', 'ITALIANITÀ', "''", 'vili', ',', 'camorristi' ...
      ['1', 'vili', ',', 'camorristi', ',', 'sicari', ',', 'falsari', 'austriacanti',  ...
      ['■■■', 'Rassegna', '_', 'Both', 'Phones', 'ANNO', 'No', '.', '1', 'perche', "'" ...
      ['Rassegna', 'Jjoth', 'Phones', 'ANNO', 'No', '.', '2', 'BASTA', '!', '...', 'qu ...
      ['Both', 'Phones', 'ANNO', '.', 'No', '.', '2', 'BASTA', '!', '...', 'uà', 'quai ...
      ['■', 'jSrìt', '*', '*', 'W', '?', '?', 'iIK', '38®f-', 'i^M', 'F', '<', '5É', ' ...
      ['■Both', 'Phones', 'ANNO', '11', '.', 'No', '.', '5', 'COSE', 'POSTO', 'va', 'd ...

  text_corpus_file_bundle
      bundle name       data
      number_of_files   16
      size              298452
      included files
          (relative) path                                         size
          ───────────────────────────────────────────────────────────────
          La_Ragione/sn84037024_1917-04-25_ed-1_seq-1_ocr.txt     16613
          La_Ragione/sn84037024_1917-04-25_ed-2_seq-1_ocr.txt     16679
          La_Ragione/sn84037024_1917-04-25_ed-3_seq-1_ocr.txt     16793
          La_Ragione/sn84037024_1917-04-25_ed-4_seq-1_ocr.txt     16235
          La_Ragione/sn84037024_1917-05-05_ed-1_seq-1_ocr.txt     18346
          La_Ragione/sn84037024_1917-05-05_ed-2_seq-1_ocr.txt     18474
          La_Ragione/sn84037024_1917-05-05_ed-3_seq-1_ocr.txt     18280
          La_Ragione/sn84037024_1917-05-05_ed-4_seq-1_ocr.txt     18481
          La_Ragione/sn84037024_1917-05-16_ed-1_seq-1_ocr.txt     18620
          La_Ragione/sn84037024_1917-05-16_ed-2_seq-1_ocr.txt     18698
          La_Ragione/sn84037024_1917-05-16_ed-3_seq-1_ocr.txt     18540
          La_Rassegna/sn84037025_1917-04-07_ed-1_seq-1_ocr.txt    19397
          La_Rassegna/sn84037025_1917-04-14_ed-1_seq-1_ocr.txt    20647
          La_Rassegna/sn84037025_1917-04-14_ed-2_seq-1_ocr.txt    20650
          La_Rassegna/sn84037025_1917-04-21_ed-1_seq-1_ocr.txt    21017
          La_Rassegna/sn84037025_1917-04-21_ed-2_seq-1_ocr.txt    20982

  text_corpus_table
      id   rel_path                  mime_type    size    content                    file_name
      ──────────────────────────────────────────────────────────────────────────────────────────────────────
      0    La_Ragione/sn84037024_1   text/plain   16613   LA RAGIONE                 sn84037024_1917-04-25_e
      1    La_Ragione/sn84037024_1   text/plain   16679   LA RAG ONE                 sn84037024_1917-04-25_e
      2    La_Ragione/sn84037024_1   text/plain   16793   LA RAGIONE                 sn84037024_1917-04-25_e
      3    La_Ragione/sn84037024_1   text/plain   16235   contro i vili, i camorri   sn84037024_1917-04-25_e
      4    La_Ragione/sn84037024_1   text/plain   18346   contro i vili, i camorri   sn84037024_1917-05-05_e
      5    La_Ragione/sn84037024_1   text/plain   18474   LA RAGIONA                 sn84037024_1917-05-05_e
      6    La_Ragione/sn84037024_1   text/plain   18280   LA RAGIONE                 sn84037024_1917-05-05_e
      7    La_Ragione/sn84037024_1   text/plain   18481   LA RAGIONE                 sn84037024_1917-05-05_e
      8    La_Ragione/sn84037024_1   text/plain   18620   contro i vili, i camorri   sn84037024_1917-05-16_e
      9    La_Ragione/sn84037024_1   text/plain   18698   LA RAG ONE                 sn84037024_1917-05-16_e
      10   La_Ragione/sn84037024_1   text/plain   18540   contro 1 vili, i camorri   sn84037024_1917-05-16_e
      11   La_Rassegna/sn84037025_   text/plain   19397   ■■■                        sn84037025_1917-04-07_e
      12   La_Rassegna/sn84037025_   text/plain   20647   La Rassegna                sn84037025_1917-04-14_e
      13   La_Rassegna/sn84037025_   text/plain   20650   Both Phones                sn84037025_1917-04-14_e
      14   La_Rassegna/sn84037025_   text/plain   21017   ■ jSrìt** W?? iIK 38®f-    sn84037025_1917-04-21_e
      15   La_Rassegna/sn84037025_   text/plain   20982   ■Both Phones               sn84037025_1917-04-21_e

  tokenized_corpus
      ['LA', 'RAGIONE', 'ORGANO', 'DI', 'DIFESA', 'DELLA', "ITALIANITÀ'", 'contro', '1 ...
      ['LA', 'RAG', 'ONE', 'contro', 'i', 'vili', ',', 'i', 'camorristi', ',', 'i', 's ...
      ['LA', 'RAGIONE', 'ORGANO', 'DI', 'DIFESA', 'DELLA', 'ITALIANITÀ', 'contro', 'i' ...
      ['contro', 'i', 'vili', ',', 'i', 'camorristi', ',', 'i', 'sicari', ',', 'i', 'f ...
      ['contro', 'i', 'vili', ',', 'i', 'camorristi', ',', 'i', 'sicari', ',', 'i', 'f ...
      ['LA', 'RAGIONA', 'ORGANO', 'DI', 'DIFESA', 'DELLA', 'ITALIANITÀ', 'contro', 'i' ...
      ['LA', 'RAGIONE', 'ORGANO', 'DI', 'DIFESA', 'DELLA', "ITALIANITÀ'", 'contro', 'i ...
      ['LA', 'RAGIONE', 'contro', 'i', 'vili', ',', '1', 'camorristi', ',', 'i', 'sica ...
      ['contro', 'i', 'vili', ',', 'i', 'camorristi', ',', 'i', 'sicari', ',', 'i', 'f ...
      ['LA', 'RAG', 'ONE', 'ORGANO', 'DI', 'DIFESA', 'DELLA', 'ITALIANITÀ', "''", 'con ...
      ['contro', '1', 'vili', ',', 'i', 'camorristi', ',', 'i', 'sicari', ',', 'i', 'f ...
      ['■■■', 'La', 'Rassegna', '_', 'I', 'Both', 'Phones', 'ANNO', 'L', 'No', '.', '1 ...
      ['La', 'Rassegna', 'Jjoth', 'Phones', 'ANNO', 'L', 'No', '.', '2', 'BASTA', '!', ...
      ['Both', 'Phones', 'ANNO', 'I', '.', 'No', '.', '2', 'BASTA', '!', '...', 'uà',  ...
      ['■', 'jSrìt', '*', '*', 'W', '?', '?', 'iIK', '38®f-', 'i^M', 'F', '<', '5É', ' ...
      ['■Both', 'Phones', 'ANNO', '11', '.', 'No', '.', '5', 'LE', 'COSE', 'A', 'POSTO ...

  topic_models
      dict data     {
                      "7": [
                        [0, "0.085*\",\" + 0.028*\".\" + 0.012*\";\" + 0.006*\"'\" + 0.005*\"!\" + 0.004*\"''…],
                        [1, "0.089*\",\" + 0.035*\".\" + 0.010*\";\" + 0.006*\"!\" + 0.006*\"'\" + 0.004*\":\…],
                        [2, "0.101*\",\" + 0.031*\".\" + 0.012*\";\" + 0.006*\"'\" + 0.006*\"!\" + 0.005*\"*\…],
                        [3, "0.067*\",\" + 0.032*\".\" + 0.013*\";\" + 0.007*\"'\" + 0.006*\"!\" + 0.004*\":\…],
                        [4, "0.075*\",\" + 0.037*\".\" + 0.013*\";\" + 0.008*\"!\" + 0.006*\"'\" + 0.004*\"se…],
                        [5, "0.073*\",\" + 0.030*\".\" + 0.014*\";\" + 0.007*\"!\" + 0.004*\"'\" + 0.003*\"pe…],
                        [6, "0.069*\",\" + 0.026*\".\" + 0.007*\";\" + 0.007*\"!\" + 0.006*\"'\" + 0.004*\"''…]
                      ],
                      "8": [
                        [0, "0.028*\",\" + 0.010*\".\" + 0.004*\";\" + 0.004*\"'\" + 0.003*\"!\" + 0.003*\":\…],
                        [1, "0.069*\",\" + 0.025*\".\" + 0.013*\";\" + 0.008*\"!\" + 0.005*\"'\" + 0.004*\":\…],
                        [2, "0.056*\",\" + 0.027*\".\" + 0.012*\";\" + 0.007*\"'\" + 0.005*\"!\" + 0.005*\":\…],
                        [3, "0.110*\",\" + 0.040*\".\" + 0.012*\";\" + 0.006*\"!\" + 0.006*\"'\" + 0.005*\"*\…],
                        [4, "0.029*\",\" + 0.017*\".\" + 0.006*\";\" + 0.004*\"!\" + 0.003*\"'\" + 0.002*\":\…],
                        [5, "0.015*\",\" + 0.010*\".\" + 0.004*\";\" + 0.003*\"'\" + 0.002*\"!\" + 0.002*\"''…],
                        [6, "0.057*\",\" + 0.033*\".\" + 0.013*\";\" + 0.008*\"!\" + 0.007*\"'\" + 0.005*\"se…],
                        [7, "0.077*\",\" + 0.022*\".\" + 0.012*\";\" + 0.007*\"!\" + 0.006*\"'\" + 0.004*\"''…]
                      ],
                      "9": [
                        [0, "0.064*\",\" + 0.023*\".\" + 0.010*\";\" + 0.006*\"!\" + 0.004*\"'\" + 0.004*\"*\…],
                        [1, "0.049*\",\" + 0.032*\".\" + 0.009*\";\" + 0.005*\"'\" + 0.004*\"!\" + 0.003*\"``…],
                        [2, "0.079*\",\" + 0.020*\".\" + 0.010*\";\" + 0.004*\":\" + 0.004*\"'\" + 0.004*\"se…],
                        [3, "0.109*\",\" + 0.030*\".\" + 0.015*\";\" + 0.007*\"!\" + 0.007*\"'\" + 0.004*\":\…],
                        [4, "0.051*\",\" + 0.030*\".\" + 0.010*\";\" + 0.005*\"''\" + 0.004*\"'\" + 0.004*\"!…],
                        [5, "0.047*\",\" + 0.016*\".\" + 0.008*\";\" + 0.005*\"!\" + 0.003*\"''\" + 0.003*\"'…],
                        [6, "0.074*\",\" + 0.030*\".\" + 0.010*\";\" + 0.006*\"!\" + 0.005*\"''\" + 0.005*\"'…],
                        [7, "0.079*\",\" + 0.039*\".\" + 0.012*\";\" + 0.007*\"!\" + 0.006*\"'\" + 0.004*\":\…],
                        [8, "0.094*\",\" + 0.041*\".\" + 0.012*\";\" + 0.007*\"'\" + 0.007*\"!\" + 0.004*\"*\…]
                      ]
                    }
      dict schema   {
                      "title": "dict",
                      "type": "object"
                    }

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
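Individual outputs can also be pulled out and used as plain Python data. A sketch, assuming the value map returned by current_output_values supports get_value_data (as kiara value maps generally do):
# Fetch the raw data behind a single output field (assumption: the value map
# exposes a get_value_data helper).
coherence = workflow.current_output_values.get_value_data("coherence_map")
print(coherence)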
Workflow snapshot
So far, our workflow only exists in memory. If we want to save it so we can have a look at it again at a later stage, we can snapshot the current state, which will save the current structure of the internal pipeline as well as all inputs that are currently set. In addition, this will register the workflow under the alias we specified at the top of this file when creating the Workflow object (in our case: topic_modeling).
If we did not specify save=True, the structure of the pipeline and its inputs would still be frozen and kept, but only in memory, and we'd only be able to access it in our current session.
workflow.snapshot(save=True)
╭─────────────────────────────────────────────────────────────────────────────────────────────────╮

  state id   zdpuB2sHA2sfuLESmdATMSZm5Bi9TSJS697TGGVuJfvEqGrdT

  pipeline inputs
    field name                               status   required   default   value id
    ───────────────────────────────────────────────────────────────────────────────────────────────
    create_stopwords_list__languages         valid    no                   76394b45-2dcb-44ca-b22b-01e0ee25bd9f
    create_stopwords_list__stopword_lists    valid    no                   cfd4f58b-358d-4640-af08-b745ffaa7fd3
    import_text_corpus__path                 valid    yes                  8721e6c2-9215-4020-93aa-f2fa90eec13b
    extract_filename_column__column_name     valid    yes                  0b6bd257-60c6-4434-b66c-e855aa203622
    extract_texts_column__column_name        valid    yes                  fd3ef854-c982-4589-b5c4-112aae1c597e
    create_date_array__force_non_null        valid    no         True      00000000-0000-0000-0000-000000000002
    create_date_array__max_index             valid    no                   a3846cdb-68cb-461a-9635-d0e9bb73681e
    create_date_array__min_index             valid    no                   4e8b9a9d-80e0-4595-a244-400746a2215b
    create_date_array__remove_tokens         valid    no         []        00000000-0000-0000-0000-000000000002
    tokenize_content__tokenize_by_word       valid    no         True      00000000-0000-0000-0000-000000000002
    preprocess_corpus__remove_all_numeric    valid    no         False     00000000-0000-0000-0000-000000000002
    preprocess_corpus__remove_alphanumeric   valid    no         False     00000000-0000-0000-0000-000000000002
    preprocess_corpus__remove_non_alpha      valid    no         False     00000000-0000-0000-0000-000000000002
    preprocess_corpus__remove_short_tokens   valid    no         False     00000000-0000-0000-0000-000000000002
    preprocess_corpus__to_lowercase          valid    no         False     00000000-0000-0000-0000-000000000002
    generate_lda__compute_coherence          valid    no         False     0f2ecd3e-15a8-482b-b0e2-20b65d5d4f53
    generate_lda__num_topics_max             valid    no                   f9e07551-de8d-4e36-a9f6-cffc01d4c149
    generate_lda__num_topics_min             valid    no         7         220f512a-03c0-4693-bba7-73142b8522ae
    generate_lda__words_per_topic            valid    no         10        00000000-0000-0000-0000-000000000002

  steps
    ├── stage: 1
    │   ├── step: create_stopwords_list
    │   │   └── status: results ready
    │   └── step: import_text_corpus
    │       └── status: results ready
    ├── stage: 2
    │   └── step: create_text_corpus
    │       └── status: results ready
    ├── stage: 3
    │   ├── step: extract_filename_column
    │   │   └── status: results ready
    │   └── step: extract_texts_column
    │       └── status: results ready
    ├── stage: 4
    │   ├── step: create_date_array
    │   │   └── status: results ready
    │   └── step: tokenize_content
    │       └── status: results ready
    ├── stage: 5
    │   └── step: preprocess_corpus
    │       └── status: results ready
    └── stage: 6
        └── step: generate_lda
            └── status: results ready

  pipeline outputs
    field name                               status   type          value id
    ──────────────────────────────────────────────────────────────────────────────────────────
    create_stopwords_list__stopwords_list    valid    list          c4c81256-c387-471e-8a05-76890eaefbcd
    import_text_corpus__file_bundle          valid    file_bundle   9856156c-b8dd-46c4-820d-e7f4934f89c9
    create_text_corpus__table                valid    table         640866c0-dac7-48f3-82d4-8a85617ca072
    extract_filename_column__array           valid    array         b74e8402-ee7c-446f-8f3f-6fe9048d1e7e
    extract_texts_column__array              valid    array         4b002cd8-2e48-456c-8cae-5479c5b1d612
    create_date_array__date_array            valid    array         4abcfa2e-8ba0-4b04-8414-2a3e5ac38eea
    tokenize_content__tokens_array           valid    array         e184c7e5-83c6-4821-ad3a-54b79afff9aa
    preprocess_corpus__tokens_array          valid    array         685323a1-0db9-41b8-a518-c247a089e6ca
    generate_lda__coherence_map              valid    dict          c746a2dd-a587-42c1-9967-9753789bf616
    generate_lda__coherence_table            valid    table         5fd0a91d-7682-4b08-a0a9-f1eb361d60f0
    generate_lda__topic_models               valid    dict          2c1ad51f-badf-4276-b368-d2e144e361bc

╰─────────────────────────────────────────────────────────────────────────────────────────────────╯
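In a later Python session, the saved workflow should be retrievable by the same alias. A hypothetical sketch (assumption: a Workflow.load classmethod mirrors the alias lookup that the CLI performs below):
# In a fresh session (hypothetical; assumes Workflow.load resolves aliases):
from kiara.interfaces.python_api.workflow import Workflow

workflow = Workflow.load("topic_modeling")
workflow.current_state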
Now, we can access our workflow in other environments, for example from the commandline:
! kiara workflow list
alias(es) workflow_id # steps # stages # states description
─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
topic_modeling 1a0f56d8-bfb6-459a-b7ad-ad52ea1828ff 9 6 1 Example topic-modeling end-to-end workflow.
! kiara workflow explain topic_modeling
╭─ Workflow: topic_modeling ───────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ │
│ documentation │
│ Example topic-modeling end-to-end workflow. │
│ │
│ author(s) │
│ │
│ workflow id 1a0f56d8-bfb6-459a-b7ad-ad52ea1828ff │
│ context │
│ │
│ current aliases {"inputs":{"extract_texts_column__column_name":"content_column_name","extract_filename_column__column_name":"fi… │
│ current inputs │
│ field value │
│ ────────────────────────────────────────────────────────────────────────────────────────────────────────────── │
│ compute_coherence True │
│ content_column_name content │
│ date_force_non_null -- none/not set -- │
│ date_parse_max 21 │
│ date_parse_min 11 │
│              date_remove_tokens                  -- none/not set --                                                 │
│ filename_column_name file_name │
│ languages │
│ Field Type Value Description │
│ ──────────────────────────────────────────────────────────────────────────────── │
│ item_schema object { The schema. │
│ "title": "list", │
│ "type": "object" │
│ } │
│ │
│ list_data array italian The data. │
│ │
│ python_class -- check source -- The python class of │
│ pytho… list which model │
│ pytho… built… instances are │
│ full_… list created. This is │
│ mostly meant as a │
│ hint for client │
│ applications. │
│ │
│ num_topics_max 9 │
│ num_topics_min 7 │
│ remove_all_numeric -- none/not set -- │
│ remove_alphanumeric -- none/not set -- │
│ remove_non_alpha -- none/not set -- │
│ remove_short_tokens -- none/not set -- │
│ stopword_lists │
│ Field Type Value Description │
│ ──────────────────────────────────────────────────────────────────────────────── │
│ item_schema object { The schema. │
│ "title": "list", │
│ "type": "object" │
│ } │
│ │
│ list_data array The data. │
│ │
│ python_class -- check source -- The python class of │
│ pytho… list which model │
│ pytho… built… instances are │
│ full_… list created. This is │
│ mostly meant as a │
│ hint for client │
│ applications. │
│ │
│ text_corpus_folder_path /home/markus/projects/kiara/dev/kiara.examples/examples/pipelines/topic_modeling/… │
│ to_lowercase -- none/not set -- │
│ tokenize_by_word -- none/not set -- │
│ words_per_topic -- none/not set -- │
│ │
│ current outputs │
│ field value │
│ ────────────────────────────────────────────────────────────────────────────────────────────────────────────── │
│ coherence_map │
│ dict data { │
│ "7": 0.23418388139804186, │
│ "8": 0.2320449393569416, │
│ "9": 0.232008834255219 │
│ } │
│ dict schema { │
│ "title": "dict", │
│ "type": "object" │
│ } │
│ │
│ coherence_table │
│ topic_id words num_topics │
│ ──────────────────────────────────────────────────────────────────────────────── │
│ 1 sempre, j, perchè, 1, degenerato, mente, mai, no, solo 7 │
│ 2 no, j, re, perchè, to, 1, sempre, te, egli, degenerato 7 │
│ 3 perchè, 1, sempre, no, j, to, degenerato, egli, te, re 7 │
│ 4 sempre, j, no, altri, perchè, te, 1, degenerato, solo, 7 │
│ 5 sempre, perchè, j, no, degenerato, te, solo, 1, egli, 7 │
│ 6 perchè, degenerato, egli, no, j, re, 1, sempre, fatto, 7 │
│ 7 j, sempre, degenerato, 1, egli, no, to, oggi, te, re, 7 │
│ 1 sempre, perchè, altri, 1, j, te, no, Figli, solo, re, 8 │
│ 2 j, sempre, degenerato, te, re, to, 1, perchè, mai, ta, 8 │
│ 3 sempre, 1, j, perchè, no, degenerato, fare, re, altri, 8 │
│ 4 no, perchè, sempre, j, degenerato, 1, egli, to, re, te 8 │
│ 5 sempre, perchè, degenerato, 1, egli, j, re, mai, altri 8 │
│ 6 sempre, altri, solo, to, j, perchè, mente, 1, re, Stat 8 │
│ 7 sempre, j, re, perchè, degenerato, 1, egli, no, te, to 8 │
│ 8 sempre, perchè, te, 1, j, re, egli, degenerato, no, to 8 │
│ 1 sempre, perchè, j, no, altri, degenerato, re, te, 1, m 9 │
│ 2 no, perchè, sempre, j, 1, degenerato, to, egli, oggi, 9 │
│ 3 sempre, te, perchè, j, no, to, 1, egli, altri, re, men 9 │
│ 4 sempre, j, degenerato, no, 1, re, te, perchè, fare, fa 9 │
│ 5 sempre, no, perchè, 1, j, altri, mente, to, fatto, deg 9 │
│ 6 sempre, egli, j, 1, no, degenerato, perchè, te, essi, 9 │
│ 7 sempre, to, perchè, j, 1, mente, altri, re, solo, essi 9 │
│ 8 sempre, perchè, egli, no, j, 1, degenerato, re, te, du 9 │
│ 9 perchè, j, sempre, no, degenerato, egli, 1, to, re, te 9 │
│ │
│ content_array │
│ LA RAGIONE │
│ LA RAG ONE │
│ LA RAGIONE │
│ contro i vili, i camorristi, i sicari, i falsari e gli austriacanti, nemici de │
│ contro i vili, i camorristi, i sicari, i falsari e gli austriacanti, nemici de │
│ LA RAGIONA │
│ LA RAGIONE │
│ LA RAGIONE │
│ contro i vili, i camorristi, i sicari, i falsari e gli austriacanti, nemici de │
│ LA RAG ONE │
│ contro 1 vili, i camorristi, i sicari, i falsari e gli austriacanti, nemici de │
│ ■■■ │
│ La Rassegna │
│ Both Phones │
│ ■ jSrìt** W?? iIK 38®f- i^M │
│ ■Both Phones │
│ │
│ date_array │
│ 1917-04-25 00:00:00 │
│ 1917-04-25 00:00:00 │
│ 1917-04-25 00:00:00 │
│ 1917-04-25 00:00:00 │
│ 1917-05-05 00:00:00 │
│ 1917-05-05 00:00:00 │
│ 1917-05-05 00:00:00 │
│ 1917-05-05 00:00:00 │
│ 1917-05-16 00:00:00 │
│ 1917-05-16 00:00:00 │
│ 1917-05-16 00:00:00 │
│ 1917-04-07 00:00:00 │
│ 1917-04-14 00:00:00 │
│ 1917-04-14 00:00:00 │
│ 1917-04-21 00:00:00 │
│ 1917-04-21 00:00:00 │
│ │
│ preprocessed_corpus │
│ ['RAGIONE', 'ORGANO', 'DIFESA', "ITALIANITÀ'", '1', 'vili', ',', 'camorristi', │
│ ['RAG', 'ONE', 'vili', ',', 'camorristi', ',', 'sicari', ',', 'falsari', 'aust │
│ ['RAGIONE', 'ORGANO', 'DIFESA', 'ITALIANITÀ', 'vili', ',', 'camorristi', ',', │
│ ['vili', ',', 'camorristi', ',', 'sicari', ',', 'falsari', 'austriacanti', ',' │
│ ['vili', ',', 'camorristi', ',', 'sicari', ',', 'falsari', 'austriacanti', ',' │
│ ['RAGIONA', 'ORGANO', 'DIFESA', 'ITALIANITÀ', 'vili', ',', 'camorristi', ',', │
│ ['RAGIONE', 'ORGANO', 'DIFESA', "ITALIANITÀ'", 'vili', ',', 'camorristi', ',', │
│ ['RAGIONE', 'vili', ',', '1', 'camorristi', ',', 'sicari', ',', 'falsari', 'au │
│ ['vili', ',', 'camorristi', ',', 'sicari', ',', 'falsari', 'austriacanti', ',' │
│ ['RAG', 'ONE', 'ORGANO', 'DIFESA', 'ITALIANITÀ', "''", 'vili', ',', 'camorrist │
│ ['1', 'vili', ',', 'camorristi', ',', 'sicari', ',', 'falsari', 'austriacanti' │
│ ['■■■', 'Rassegna', '_', 'Both', 'Phones', 'ANNO', 'No', '.', '1', 'perche', " │
│ ['Rassegna', 'Jjoth', 'Phones', 'ANNO', 'No', '.', '2', 'BASTA', '!', '...', ' │
│ ['Both', 'Phones', 'ANNO', '.', 'No', '.', '2', 'BASTA', '!', '...', 'uà', 'qu │
│ ['■', 'jSrìt', '*', '*', 'W', '?', '?', 'iIK', '38®f-', 'i^M', 'F', '<', '5É', │
│ ['■Both', 'Phones', 'ANNO', '11', '.', 'No', '.', '5', 'COSE', 'POSTO', 'va', │
│ │
│ text_corpus_file_bundle │
│ bundle name data │
│ number_of_files 16 │
│ size 298452 │
│ included files │
│ (relative) path size │
│ ────────────────────────────────────────────────────────── │
│ La_Ragione/sn84037024_1917-04-25_ed-1_seq-1_ocr… 16613 │
│ La_Ragione/sn84037024_1917-04-25_ed-2_seq-1_ocr… 16679 │
│ La_Ragione/sn84037024_1917-04-25_ed-3_seq-1_ocr… 16793 │
│ La_Ragione/sn84037024_1917-04-25_ed-4_seq-1_ocr… 16235 │
│ La_Ragione/sn84037024_1917-05-05_ed-1_seq-1_ocr… 18346 │
│ La_Ragione/sn84037024_1917-05-05_ed-2_seq-1_ocr… 18474 │
│ La_Ragione/sn84037024_1917-05-05_ed-3_seq-1_ocr… 18280 │
│ La_Ragione/sn84037024_1917-05-05_ed-4_seq-1_ocr… 18481 │
│ La_Ragione/sn84037024_1917-05-16_ed-1_seq-1_ocr… 18620 │
│ La_Ragione/sn84037024_1917-05-16_ed-2_seq-1_ocr… 18698 │
│ La_Ragione/sn84037024_1917-05-16_ed-3_seq-1_ocr… 18540 │
│ La_Rassegna/sn84037025_1917-04-07_ed-1_seq-1_oc… 19397 │
│ La_Rassegna/sn84037025_1917-04-14_ed-1_seq-1_oc… 20647 │
│ La_Rassegna/sn84037025_1917-04-14_ed-2_seq-1_oc… 20650 │
│ La_Rassegna/sn84037025_1917-04-21_ed-1_seq-1_oc… 21017 │
│ La_Rassegna/sn84037025_1917-04-21_ed-2_seq-1_oc… 20982 │
│ │
│ │
│ text_corpus_table │
│ id rel_path mime_type size content file_name │
│ ──────────────────────────────────────────────────────────────────────────────── │
│ 0 La_Ragione/sn84 text/plain 16613 LA RAGIONE sn84037024_1917 │
│ 1 La_Ragione/sn84 text/plain 16679 LA RAG ONE sn84037024_1917 │
│ 2 La_Ragione/sn84 text/plain 16793 LA RAGIONE sn84037024_1917 │
│ 3 La_Ragione/sn84 text/plain 16235 contro i vili, i sn84037024_1917 │
│ 4 La_Ragione/sn84 text/plain 18346 contro i vili, i sn84037024_1917 │
│ 5 La_Ragione/sn84 text/plain 18474 LA RAGIONA sn84037024_1917 │
│ 6 La_Ragione/sn84 text/plain 18280 LA RAGIONE sn84037024_1917 │
│ 7 La_Ragione/sn84 text/plain 18481 LA RAGIONE sn84037024_1917 │
│ 8 La_Ragione/sn84 text/plain 18620 contro i vili, i sn84037024_1917 │
│ 9 La_Ragione/sn84 text/plain 18698 LA RAG ONE sn84037024_1917 │
│ 10 La_Ragione/sn84 text/plain 18540 contro 1 vili, i sn84037024_1917 │
│ 11 La_Rassegna/sn8 text/plain 19397 ■■■ sn84037025_1917 │
│ 12 La_Rassegna/sn8 text/plain 20647 La Rassegna sn84037025_1917 │
│ 13 La_Rassegna/sn8 text/plain 20650 Both Phones sn84037025_1917 │
│ 14 La_Rassegna/sn8 text/plain 21017 ■ jSrìt** W?? iI sn84037025_1917 │
│ 15 La_Rassegna/sn8 text/plain 20982 ■Both Phones sn84037025_1917 │
│ │
│ tokenized_corpus │
│ ['LA', 'RAGIONE', 'ORGANO', 'DI', 'DIFESA', 'DELLA', "ITALIANITÀ'", 'contro', │
│ ['LA', 'RAG', 'ONE', 'contro', 'i', 'vili', ',', 'i', 'camorristi', ',', 'i', │
│ ['LA', 'RAGIONE', 'ORGANO', 'DI', 'DIFESA', 'DELLA', 'ITALIANITÀ', 'contro', ' │
│ ['contro', 'i', 'vili', ',', 'i', 'camorristi', ',', 'i', 'sicari', ',', 'i', │
│ ['contro', 'i', 'vili', ',', 'i', 'camorristi', ',', 'i', 'sicari', ',', 'i', │
│ ['LA', 'RAGIONA', 'ORGANO', 'DI', 'DIFESA', 'DELLA', 'ITALIANITÀ', 'contro', ' │
│ ['LA', 'RAGIONE', 'ORGANO', 'DI', 'DIFESA', 'DELLA', "ITALIANITÀ'", 'contro', │
│ ['LA', 'RAGIONE', 'contro', 'i', 'vili', ',', '1', 'camorristi', ',', 'i', 'si │
│ ['contro', 'i', 'vili', ',', 'i', 'camorristi', ',', 'i', 'sicari', ',', 'i', │
│ ['LA', 'RAG', 'ONE', 'ORGANO', 'DI', 'DIFESA', 'DELLA', 'ITALIANITÀ', "''", 'c │
│ ['contro', '1', 'vili', ',', 'i', 'camorristi', ',', 'i', 'sicari', ',', 'i', │
│ ['■■■', 'La', 'Rassegna', '_', 'I', 'Both', 'Phones', 'ANNO', 'L', 'No', '.', │
│ ['La', 'Rassegna', 'Jjoth', 'Phones', 'ANNO', 'L', 'No', '.', '2', 'BASTA', '! │
│ ['Both', 'Phones', 'ANNO', 'I', '.', 'No', '.', '2', 'BASTA', '!', '...', 'uà' │
│ ['■', 'jSrìt', '*', '*', 'W', '?', '?', 'iIK', '38®f-', 'i^M', 'F', '<', '5É', │
│ ['■Both', 'Phones', 'ANNO', '11', '.', 'No', '.', '5', 'LE', 'COSE', 'A', 'POS │
│ │
│ topic_models │
│ dict data { │
│ "7": [ │
│ [ │
│ 0, │
│ "0.085*\",\" + 0.028*\".\" + 0.012*\";\" + 0.006*\"'\" + … │
│ ], │
│ [ │
│ 1, │
│ "0.089*\",\" + 0.035*\".\" + 0.010*\";\" + 0.006*\"!\" + … │
│ ], │
│ [ │
│ 2, │
│ "0.101*\",\" + 0.031*\".\" + 0.012*\";\" + 0.006*\"'\" + … │
│ ], │
│ [ │
│ 3, │
│ "0.067*\",\" + 0.032*\".\" + 0.013*\";\" + 0.007*\"'\" + … │
│ ], │
│ [ │
│ 4, │
│ "0.075*\",\" + 0.037*\".\" + 0.013*\";\" + 0.008*\"!\" + … │
│ ], │
│ [ │
│ 5, │
│ "0.073*\",\" + 0.030*\".\" + 0.014*\";\" + 0.007*\"!\" + … │
│ ], │
│ [ │
│ 6, │
│ "0.069*\",\" + 0.026*\".\" + 0.007*\";\" + 0.007*\"!\" + … │
│ ] │
│ ], │
│ "8": [ │
│ [ │
│ 0, │
│ "0.028*\",\" + 0.010*\".\" + 0.004*\";\" + 0.004*\"'\" + … │
│ ], │
│ [ │
│ 1, │
│ "0.069*\",\" + 0.025*\".\" + 0.013*\";\" + 0.008*\"!\" + … │
│ ], │
│ [ │
│ 2, │
│ "0.056*\",\" + 0.027*\".\" + 0.012*\";\" + 0.007*\"'\" + … │
│ ], │
│ [ │
│ 3, │
│ "0.110*\",\" + 0.040*\".\" + 0.012*\";\" + 0.006*\"!\" + … │
│ ], │
│ [ │
│ 4, │
│ "0.029*\",\" + 0.017*\".\" + 0.006*\";\" + 0.004*\"!\" + … │
│ ], │
│ [ │
│ 5, │
│ "0.015*\",\" + 0.010*\".\" + 0.004*\";\" + 0.003*\"'\" + … │
│ ], │
│ [ │
│ 6, │
│ "0.057*\",\" + 0.033*\".\" + 0.013*\";\" + 0.008*\"!\" + … │
│ ], │
│ [ │
│ 7, │
│ "0.077*\",\" + 0.022*\".\" + 0.012*\";\" + 0.007*\"!\" + … │
│ ] │
│ ], │
│ "9": [ │
│ [ │
│ 0, │
│ "0.064*\",\" + 0.023*\".\" + 0.010*\";\" + 0.006*\"!\" + … │
│ ], │
│ [ │
│ 1, │
│ "0.049*\",\" + 0.032*\".\" + 0.009*\";\" + 0.005*\"'\" + … │
│ ], │
│ [ │
│ 2, │
│ "0.079*\",\" + 0.020*\".\" + 0.010*\";\" + 0.004*\":\" + … │
│ ], │
│ [ │
│ 3, │
│ "0.109*\",\" + 0.030*\".\" + 0.015*\";\" + 0.007*\"!\" + … │
│ ], │
│ [ │
│ 4, │
│ "0.051*\",\" + 0.030*\".\" + 0.010*\";\" + 0.005*\"''\" +… │
│ ], │
│ [ │
│ 5, │
│ "0.047*\",\" + 0.016*\".\" + 0.008*\";\" + 0.005*\"!\" + … │
│ ], │
│ [ │
│ 6, │
│ "0.074*\",\" + 0.030*\".\" + 0.010*\";\" + 0.006*\"!\" + … │
│ ], │
│ [ │
│ 7, │
│ "0.079*\",\" + 0.039*\".\" + 0.012*\";\" + 0.007*\"!\" + … │
│ ], │
│ [ │
│ 8, │
│ "0.094*\",\" + 0.041*\".\" + 0.012*\";\" + 0.007*\"'\" + … │
│ ] │
│ ] │
│ } │
│ dict schema { │
│ "title": "dict", │
│ "type": "object" │
│ } │
│ │
│ │
│ snapshot timeline │
│ 2022-10-03 13:07:56.228937+00:00 zdpuB2sHA2sfuLESmdATMSZm5Bi9TSJS697TGGVuJfvEqGrdT │
│ │
│ current state id zdpuB2sHA2sfuLESmdATMSZm5Bi9TSJS697TGGVuJfvEqGrdT │
│ current state details │
│ pipeline inputs │
│ field name status required default value id │
│ ─────────────────────────────────────────────────────────────────────────────────────────── │
│ create_stopwords_list__lang… valid no 76394b45-2dcb-44ca-b22b-01e0 │
│ ee25bd9f │
│ create_stopwords_list__stop… valid no cfd4f58b-358d-4640-af08-b745 │
│ ffaa7fd3 │
│ import_text_corpus__path valid yes 8721e6c2-9215-4020-93aa-f2fa │
│ 90eec13b │
│ extract_filename_column__co… valid yes 0b6bd257-60c6-4434-b66c-e855 │
│ aa203622 │
│ extract_texts_column__colum… valid yes fd3ef854-c982-4589-b5c4-112a │
│ ae1c597e │
│ create_date_array__force_no… valid no True 00000000-0000-0000-0000-0000 │
│ 00000002 │
│ create_date_array__max_index valid no a3846cdb-68cb-461a-9635-d0e9 │
│ bb73681e │
│ create_date_array__min_index valid no 4e8b9a9d-80e0-4595-a244-4007 │
│ 46a2215b │
│ create_date_array__remove_t… valid no [] 00000000-0000-0000-0000-0000 │
│ 00000002 │
│ tokenize_content__tokenize_… valid no True 00000000-0000-0000-0000-0000 │
│ 00000002 │
│ preprocess_corpus__remove_a… valid no False 00000000-0000-0000-0000-0000 │
│ 00000002 │
│ preprocess_corpus__remove_a… valid no False 00000000-0000-0000-0000-0000 │
│ 00000002 │
│ preprocess_corpus__remove_n… valid no False 00000000-0000-0000-0000-0000 │
│ 00000002 │
│ preprocess_corpus__remove_s… valid no False 00000000-0000-0000-0000-0000 │
│ 00000002 │
│ preprocess_corpus__to_lower… valid no False 00000000-0000-0000-0000-0000 │
│ 00000002 │
│ generate_lda__compute_coher… valid no False 0f2ecd3e-15a8-482b-b0e2-20b6 │
│ 5d5d4f53 │
│ generate_lda__num_topics_max valid no f9e07551-de8d-4e36-a9f6-cffc │
│ 01d4c149 │
│ generate_lda__num_topics_min valid no 7 220f512a-03c0-4693-bba7-7314 │
│ 2b8522ae │
│ generate_lda__words_per_top… valid no 10 00000000-0000-0000-0000-0000 │
│ 00000002 │
│ │
│ steps steps │
│ ├── stage: 1 │
│ │ ├── step: create_stopwords_list │
│ │ │ └── status: results ready │
│ │ └── step: import_text_corpus │
│ │ └── status: results ready │
│ ├── stage: 2 │
│ │ └── step: create_text_corpus │
│ │ └── status: results ready │
│ ├── stage: 3 │
│ │ ├── step: extract_filename_column │
│ │ │ └── status: results ready │
│ │ └── step: extract_texts_column │
│ │ └── status: results ready │
│ ├── stage: 4 │
│ │ ├── step: create_date_array │
│ │ │ └── status: results ready │
│ │ └── step: tokenize_content │
│ │ └── status: results ready │
│ ├── stage: 5 │
│ │ └── step: preprocess_corpus │
│ │ └── status: results ready │
│ └── stage: 6 │
│ └── step: generate_lda │
│ └── status: results ready │
│ pipeline outputs │
│ field name status type value id │
│ ─────────────────────────────────────────────────────────────────────────────────────────── │
│ create_stopwords_list__stopword… valid list c4c81256-c387-471e-8a05-76890ea │
│ efbcd │
│ import_text_corpus__file_bundle valid file_bundle 9856156c-b8dd-46c4-820d-e7f4934 │
│ f89c9 │
│ create_text_corpus__table valid table 640866c0-dac7-48f3-82d4-8a85617 │
│ ca072 │
│ extract_filename_column__array valid array b74e8402-ee7c-446f-8f3f-6fe9048 │
│ d1e7e │
│ extract_texts_column__array valid array 4b002cd8-2e48-456c-8cae-5479c5b │
│ 1d612 │
│ create_date_array__date_array valid array 4abcfa2e-8ba0-4b04-8414-2a3e5ac │
│ 38eea │
│ tokenize_content__tokens_array valid array e184c7e5-83c6-4821-ad3a-54b79af │
│ ff9aa │
│ preprocess_corpus__tokens_array valid array 685323a1-0db9-41b8-a518-c247a08 │
│ 9e6ca │
│ generate_lda__coherence_map valid dict c746a2dd-a587-42c1-9967-9753789 │
│ bf616 │
│ generate_lda__coherence_table valid table 5fd0a91d-7682-4b08-a0a9-f1eb361 │
│ d60f0 │
│ generate_lda__topic_models valid dict 2c1ad51f-badf-4276-b368-d2e144e │
│ 361bc │
│ │
│ │
│ │
╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯