tokens

Attributes

log = structlog.getLogger() module-attribute

Classes

TokenizeTextConfig

Bases: KiaraModuleConfig

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/language_processing/modules/tokens.py
class TokenizeTextConfig(KiaraModuleConfig):

    filter_non_alpha: bool = Field(
        description="Whether to filter out non alpha tokens.", default=True
    )
    min_token_length: int = Field(description="The minimum token length.", default=3)
    to_lowercase: bool = Field(
        description="Whether to lowercase the tokens.", default=True
    )

Attributes

filter_non_alpha: bool = Field(description='Whether to filter out non alpha tokens.', default=True) class-attribute instance-attribute
min_token_length: int = Field(description='The minimum token length.', default=3) class-attribute instance-attribute
to_lowercase: bool = Field(description='Whether to lowercase the tokens.', default=True) class-attribute instance-attribute
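KiaraModuleConfig is pydantic-based, so the three options behave like ordinary pydantic fields. A minimal standalone sketch, using a plain pydantic model as a stand-in for the real config class (purely illustrative, not the class itself):

from pydantic import BaseModel, Field

# Illustrative stand-in for TokenizeTextConfig; the real class derives from
# KiaraModuleConfig, but the field behaviour shown here is the same.
class TokenizeConfigSketch(BaseModel):
    filter_non_alpha: bool = Field(description="Whether to filter out non-alpha tokens.", default=True)
    min_token_length: int = Field(description="The minimum token length.", default=3)
    to_lowercase: bool = Field(description="Whether to lowercase the tokens.", default=True)

print(TokenizeConfigSketch())                    # all defaults: True / 3 / True
print(TokenizeConfigSketch(min_token_length=0))  # override: keep short tokens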

TokenizeTextModule

Bases: KiaraModule

Tokenize a string.

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/language_processing/modules/tokens.py
class TokenizeTextModule(KiaraModule):
    """Tokenize a string."""

    _config_cls = TokenizeTextConfig
    _module_type_name = "tokenize.string"

    def create_inputs_schema(
        self,
    ) -> ValueSetSchema:

        inputs = {"text": {"type": "string", "doc": "The text to tokenize."}}

        return inputs

    def create_outputs_schema(
        self,
    ) -> ValueSetSchema:

        outputs = {
            "token_list": {
                "type": "list",
                "doc": "The tokenized version of the input text.",
            }
        }
        return outputs

    def process(self, inputs: ValueMap, outputs: ValueMap) -> None:

        import nltk

        get_stopwords()

        # TODO: module-independent caching?
        # language = inputs.get_value_data("language")
        #
        text = inputs.get_value_data("text")
        tokenized = nltk.word_tokenize(text)

        result = tokenized
        if self.get_config_value("min_token_length") > 0:
            result = (
                x
                for x in tokenized
                if len(x) >= self.get_config_value("min_token_length")
            )

        if self.get_config_value("filter_non_alpha"):
            result = (x for x in result if x.isalpha())

        if self.get_config_value("to_lowercase"):
            result = (x.lower() for x in result)

        outputs.set_value("token_list", list(result))

Attributes

_config_cls = TokenizeTextConfig class-attribute instance-attribute

Functions

create_inputs_schema() -> ValueSetSchema
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/language_processing/modules/tokens.py
def create_inputs_schema(
    self,
) -> ValueSetSchema:

    inputs = {"text": {"type": "string", "doc": "The text to tokenize."}}

    return inputs
create_outputs_schema() -> ValueSetSchema
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/language_processing/modules/tokens.py
def create_outputs_schema(
    self,
) -> ValueSetSchema:

    outputs = {
        "token_list": {
            "type": "list",
            "doc": "The tokenized version of the input text.",
        }
    }
    return outputs
process(inputs: ValueMap, outputs: ValueMap) -> None
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/language_processing/modules/tokens.py
def process(self, inputs: ValueMap, outputs: ValueMap) -> None:

    import nltk

    get_stopwords()

    # TODO: module-independent caching?
    # language = inputs.get_value_data("language")
    #
    text = inputs.get_value_data("text")
    tokenized = nltk.word_tokenize(text)

    result = tokenized
    if self.get_config_value("min_token_length") > 0:
        result = (
            x
            for x in tokenized
            if len(x) >= self.get_config_value("min_token_length")
        )

    if self.get_config_value("filter_non_alpha"):
        result = (x for x in result if x.isalpha())

    if self.get_config_value("to_lowercase"):
        result = (x.lower() for x in result)

    outputs.set_value("token_list", list(result))

TokenizeTextArrayeModule

Bases: KiaraModule

Split sentences into words, or words into characters. In other words, this operation establishes the word boundaries (i.e., tokens), a very helpful way of finding patterns. It is also the typical step prior to stemming and lemmatization.

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/language_processing/modules/tokens.py
class TokenizeTextArrayeModule(KiaraModule):
    """Split sentences into words or words into characters.
    In other words, this operation establishes the word boundaries (i.e., tokens) a very helpful way of finding patterns. It is also the typical step prior to stemming and lemmatization
    """

    _module_type_name = "tokenize.texts_array"

    KIARA_METADATA = {
        "tags": ["tokenize", "tokens"],
    }

    def create_inputs_schema(
        self,
    ) -> ValueSetSchema:

        return {
            "texts_array": {
                "type": "array",
                "doc": "An array of text items to be tokenized.",
            },
            "tokenize_by_word": {
                "type": "boolean",
                "doc": "Whether to tokenize by word (default), or character.",
                "default": True,
            },
        }

    def create_outputs_schema(
        self,
    ) -> ValueSetSchema:

        return {
            "tokens_array": {
                "type": "array",
                "doc": "The tokenized content, as an array of lists of strings.",
            }
        }

    def process(self, inputs: ValueMap, outputs: ValueMap):

        import nltk
        import polars as pl
        import pyarrow as pa

        get_stopwords()

        array: KiaraArray = inputs.get_value_data("texts_array")

        # for text in array.arrow_array:
        #     print("----")
        #     print(len(str(text)))
        tokenize_by_word: bool = inputs.get_value_data("tokenize_by_word")

        if not tokenize_by_word:
            raise KiaraProcessingException(
                "Non-word tokenization is not yet implemented."
            )

        column: pa.ChunkedArray = array.arrow_array  # type: ignore

        # warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
        def word_tokenize(word):
            return nltk.word_tokenize(word)

        series = pl.Series(name="tokens", values=column)
        result = series.apply(word_tokenize)
        result_array = result.to_arrow()

        # TODO: remove this cast once the array data type can handle non-chunked arrays
        chunked = pa.chunked_array(result_array)
        outputs.set_values(tokens_array=chunked)
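Stripped of the kiara/polars wiring, the per-item tokenization amounts to mapping nltk.word_tokenize over the array and re-wrapping the result as an Arrow list column. A simplified sketch (not the module itself, which works on a KiaraArray and goes through polars):

import nltk
import pyarrow as pa

nltk.download("punkt", quiet=True)

texts = ["First document, short.", "A second document with a few more words."]
token_lists = [nltk.word_tokenize(t) for t in texts]

# chunked list<string> column, mirroring what the module hands back to kiara
tokens_array = pa.chunked_array([pa.array(token_lists)])
print(tokens_array.type)  # list<item: string>
print(tokens_array[0])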

Attributes

KIARA_METADATA = {'tags': ['tokenize', 'tokens']} class-attribute instance-attribute

Functions

create_inputs_schema() -> ValueSetSchema
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/language_processing/modules/tokens.py
def create_inputs_schema(
    self,
) -> ValueSetSchema:

    return {
        "texts_array": {
            "type": "array",
            "doc": "An array of text items to be tokenized.",
        },
        "tokenize_by_word": {
            "type": "boolean",
            "doc": "Whether to tokenize by word (default), or character.",
            "default": True,
        },
    }
create_outputs_schema() -> ValueSetSchema
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/language_processing/modules/tokens.py
def create_outputs_schema(
    self,
) -> ValueSetSchema:

    return {
        "tokens_array": {
            "type": "array",
            "doc": "The tokenized content, as an array of lists of strings.",
        }
    }
process(inputs: ValueMap, outputs: ValueMap)
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/language_processing/modules/tokens.py
def process(self, inputs: ValueMap, outputs: ValueMap):

    import nltk
    import polars as pl
    import pyarrow as pa

    get_stopwords()

    array: KiaraArray = inputs.get_value_data("texts_array")

    # for text in array.arrow_array:
    #     print("----")
    #     print(len(str(text)))
    tokenize_by_word: bool = inputs.get_value_data("tokenize_by_word")

    if not tokenize_by_word:
        raise KiaraProcessingException(
            "Non-word tokenization is not yet implemented."
        )

    column: pa.ChunkedArray = array.arrow_array  # type: ignore

    # warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
    def word_tokenize(word):
        return nltk.word_tokenize(word)

    series = pl.Series(name="tokens", values=column)
    result = series.apply(word_tokenize)
    result_array = result.to_arrow()

    # TODO: remove this cast once the array data type can handle non-chunked arrays
    chunked = pa.chunked_array(result_array)
    outputs.set_values(tokens_array=chunked)

AssembleStopwordsModule

Bases: KiaraModule

Create a list of stopwords from one or multiple sources.

This will download nltk stopwords if necessary, and merge all input lists into a single, sorted list without duplicates.

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/language_processing/modules/tokens.py
class AssembleStopwordsModule(KiaraModule):
    """Create a list of stopwords from one or multiple sources.

    This will download nltk stopwords if necessary, and merge all input lists into a single, sorted list without duplicates.
    """

    _module_type_name = "create.stopwords_list"

    def create_inputs_schema(
        self,
    ) -> ValueSetSchema:

        return {
            "languages": {
                "type": "list",
                "doc": "A list of languages, will be used to retrieve language-specific stopword from nltk.",
                "optional": True,
            },
            "stopwords": {
                "type": "list",
                "doc": "A list of additional, custom stopwords.",
                "optional": True,
            },
        }

    def create_outputs_schema(
        self,
    ) -> ValueSetSchema:

        return {
            "stopwords_list": {
                "type": "list",
                "doc": "A sorted list of unique stopwords.",
            }
        }

    def process(self, inputs: ValueMap, outputs: ValueMap):

        stopwords = set()
        _languages = inputs.get_value_obj("languages")

        if _languages.is_set:
            all_stopwords = get_stopwords()
            languages: KiaraList = _languages.data

            for language in languages.list_data:

                if language not in all_stopwords.fileids():
                    raise KiaraProcessingException(
                        f"Invalid language: {language}. Available: {', '.join(all_stopwords.fileids())}."
                    )
                stopwords.update(all_stopwords.words(language))

        _stopword_lists = inputs.get_value_obj("stopwords")
        if _stopword_lists.is_set:
            stopword_lists: KiaraList = _stopword_lists.data
            for stopword_list in stopword_lists.list_data:
                if isinstance(stopword_list, str):
                    stopwords.add(stopword_list)
                else:
                    stopwords.update(stopword_list)

        outputs.set_value("stopwords_list", sorted(stopwords))

Functions

create_inputs_schema() -> ValueSetSchema
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/language_processing/modules/tokens.py
def create_inputs_schema(
    self,
) -> ValueSetSchema:

    return {
        "languages": {
            "type": "list",
            "doc": "A list of languages, will be used to retrieve language-specific stopword from nltk.",
            "optional": True,
        },
        "stopwords": {
            "type": "list",
            "doc": "A list of additional, custom stopwords.",
            "optional": True,
        },
    }
create_outputs_schema() -> ValueSetSchema
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/language_processing/modules/tokens.py
def create_outputs_schema(
    self,
) -> ValueSetSchema:

    return {
        "stopwords_list": {
            "type": "list",
            "doc": "A sorted list of unique stopwords.",
        }
    }
process(inputs: ValueMap, outputs: ValueMap)
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/language_processing/modules/tokens.py
def process(self, inputs: ValueMap, outputs: ValueMap):

    stopwords = set()
    _languages = inputs.get_value_obj("languages")

    if _languages.is_set:
        all_stopwords = get_stopwords()
        languages: KiaraList = _languages.data

        for language in languages.list_data:

            if language not in all_stopwords.fileids():
                raise KiaraProcessingException(
                    f"Invalid language: {language}. Available: {', '.join(all_stopwords.fileids())}."
                )
            stopwords.update(all_stopwords.words(language))

    _stopword_lists = inputs.get_value_obj("stopwords")
    if _stopword_lists.is_set:
        stopword_lists: KiaraList = _stopword_lists.data
        for stopword_list in stopword_lists.list_data:
            if isinstance(stopword_list, str):
                stopwords.add(stopword_list)
            else:
                stopwords.update(stopword_list)

    outputs.set_value("stopwords_list", sorted(stopwords))

PreprocessModule

Bases: KiaraModule

Preprocess lists of tokens, incl. lowercasing, removing special characters, etc.

Lowercasing: Lowercase the words. This operation is a double-edged sword. It can yield better results for relatively small datasets or datasets with a high percentage of OCR mistakes. For instance, if lowercasing is not performed, the algorithm will treat USA, Usa, usa, UsA, uSA, etc. as distinct tokens, even though they may all refer to the same entity. On the other hand, if the dataset does not contain such OCR mistakes, lowercasing may make it difficult to distinguish between homonyms and thus make interpreting the topics much harder.

Removing stopwords and words with fewer than three characters: Remove low-information words. These are typically words such as articles, pronouns, prepositions, conjunctions, etc., which are not semantically salient. There are numerous stopword lists available for many, though not all, languages, which can easily be adapted to the individual researcher's needs. Removing words with fewer than three characters may additionally remove many OCR mistakes. Both operations have the dual advantage of yielding more reliable results while reducing the size of the dataset, which in turn reduces the required processing power. This step can therefore hardly be considered optional in topic modelling (TM).

Noise removal: Remove elements such as punctuation marks, special characters, numbers, HTML formatting, etc. This operation is again concerned with removing elements that may not be relevant to the text analysis and may in fact interfere with it. Depending on the dataset and research question, this operation can become essential.
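These options largely map onto plain Python string predicates (isalpha, isalnum, isdigit). A small illustrative filter in the spirit of the check_token helper in the process() implementation below (simplified; the option names and defaults here are not the module's):

# Illustrative, simplified token filter; not the module's check_token itself.
def keep_token(token, to_lowercase=True, remove_non_alpha=True,
               remove_all_numeric=True, min_length=3,
               stopwords=frozenset({"the", "and"})):
    if len(token) < min_length:                # drop short tokens first, the cheapest check
        return None
    t = token.lower() if to_lowercase else token
    if remove_non_alpha and not t.isalpha():   # drops "ex1a.mple", "876", "don't"
        return None
    if remove_all_numeric and t.isdigit():     # redundant if non-alpha tokens are already removed
        return None
    if t.lower() in stopwords:
        return None
    return t

tokens = ["The", "Quick", "ex1a.mple", "876", "and", "foxes"]
print([t for t in (keep_token(tok) for tok in tokens) if t is not None])  # ['quick', 'foxes']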

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/language_processing/modules/tokens.py
class PreprocessModule(KiaraModule):
    """Preprocess lists of tokens, incl. lowercasing, remove special characers, etc.

    Lowercasing: Lowercase the words. This operation is a double-edged sword. It can be effective at yielding potentially better results in the case of relatively small datasets or datatsets with a high percentage of OCR mistakes. For instance, if lowercasing is not performed, the algorithm will treat USA, Usa, usa, UsA, uSA, etc. as distinct tokens, even though they may all refer to the same entity. On the other hand, if the dataset does not contain such OCR mistakes, then it may become difficult to distinguish between homonyms and make interpreting the topics much harder.

    Removing stopwords and words with less than three characters: Remove low information words. These are typically words such as articles, pronouns, prepositions, conjunctions, etc. which are not semantically salient. There are numerous stopword lists available for many, though not all, languages which can be easily adapted to the individual researcher's needs. Removing words with less than three characters may additionally remove many OCR mistakes. Both these operations have the dual advantage of yielding more reliable results while reducing the size of the dataset, thus in turn reducing the required processing power. This step can therefore hardly be considered optional in TM.

    Noise removal: Remove elements such as punctuation marks, special characters, numbers, html formatting, etc. This operation is again concerned with removing elements that may not be relevant to the text analysis and in fact interfere with it. Depending on the dataset and research question, this operation can become essential.
    """

    _module_type_name = "preprocess.tokens_array"

    KIARA_METADATA = {
        "tags": ["tokens", "preprocess"],
    }

    def create_inputs_schema(
        self,
    ) -> ValueSetSchema:

        return {
            "tokens_array": {
                "type": "array",
                "doc": "The tokens array to pre-process.",
            },
            "to_lowercase": {
                "type": "boolean",
                "doc": "Apply lowercasing to the text.",
                "default": False,
            },
            "remove_alphanumeric": {
                "type": "boolean",
                "doc": "Remove all tokens that include numbers (e.g. ex1ample).",
                "default": False,
            },
            "remove_non_alpha": {
                "type": "boolean",
                "doc": "Remove all tokens that include punctuation and numbers (e.g. ex1a.mple).",
                "default": False,
            },
            "remove_all_numeric": {
                "type": "boolean",
                "doc": "Remove all tokens that contain numbers only (e.g. 876).",
                "default": False,
            },
            "remove_short_tokens": {
                "type": "integer",
                "doc": "Remove tokens shorter or equal to this value. If value is <= 0, no filtering will be done.",
                "default": 0,
            },
            "remove_stopwords": {
                "type": "list",
                "doc": "Remove stopwords.",
                "optional": True,
            },
        }

    def create_outputs_schema(
        self,
    ) -> ValueSetSchema:

        return {
            "tokens_array": {
                "type": "array",
                "doc": "The pre-processed content, as an array of lists of strings.",
            }
        }

    def process(self, inputs: ValueMap, outputs: ValueMap):

        import polars as pl
        import pyarrow as pa

        tokens_array: KiaraArray = inputs.get_value_data("tokens_array")
        lowercase: bool = inputs.get_value_data("to_lowercase")
        remove_alphanumeric: bool = inputs.get_value_data("remove_alphanumeric")
        remove_non_alpha: bool = inputs.get_value_data("remove_non_alpha")
        remove_all_numeric: bool = inputs.get_value_data("remove_all_numeric")
        remove_short_tokens: int = inputs.get_value_data("remove_short_tokens")

        if remove_short_tokens is None:
            remove_short_tokens = -1

        _remove_stopwords = inputs.get_value_obj("remove_stopwords")
        if _remove_stopwords.is_set:
            stopword_list: Union[Iterable[str], None] = _remove_stopwords.data.list_data
        else:
            stopword_list = None

        # it's better to have one method every token goes through, then do every test seperately for the token list
        # because that way each token only needs to be touched once (which is more effective)
        def check_token(token: str) -> Union[str, None]:

            # remove short tokens first, since we can save ourselves all the other checks (which are more expensive)
            assert isinstance(remove_short_tokens, int)

            if remove_short_tokens > 0:
                if len(token) <= remove_short_tokens:
                    return None

            _token: str = token
            if lowercase:
                _token = _token.lower()

            if remove_non_alpha:
                match = _token if _token.isalpha() else None
                if match is None:
                    return None

            # if remove_non_alpha was set, we don't need to worry about tokens that include numbers, since they are already filtered out
            if remove_alphanumeric and not remove_non_alpha:
                match = _token if _token.isalnum() else None
                if match is None:
                    return None

            # all-number tokens are already filtered out if the remove_non_alpha methods above ran
            if remove_all_numeric and not remove_non_alpha:
                match = None if _token.isdigit() else _token
                if match is None:
                    return None

            if stopword_list and _token and _token.lower() in stopword_list:
                return None

            return _token

        series = pl.Series(name="tokens", values=tokens_array.arrow_array)

        result = series.apply(
            lambda token_list: [
                x for x in (check_token(token) for token in token_list) if x is not None
            ]
        )
        result_array = result.to_arrow()

        # TODO: remove this cast once the array data type can handle non-chunked arrays
        chunked = pa.chunked_array(result_array)
        outputs.set_values(tokens_array=chunked)

Attributes

KIARA_METADATA = {'tags': ['tokens', 'preprocess']} class-attribute instance-attribute

Functions

create_inputs_schema() -> ValueSetSchema
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/language_processing/modules/tokens.py
def create_inputs_schema(
    self,
) -> ValueSetSchema:

    return {
        "tokens_array": {
            "type": "array",
            "doc": "The tokens array to pre-process.",
        },
        "to_lowercase": {
            "type": "boolean",
            "doc": "Apply lowercasing to the text.",
            "default": False,
        },
        "remove_alphanumeric": {
            "type": "boolean",
            "doc": "Remove all tokens that include numbers (e.g. ex1ample).",
            "default": False,
        },
        "remove_non_alpha": {
            "type": "boolean",
            "doc": "Remove all tokens that include punctuation and numbers (e.g. ex1a.mple).",
            "default": False,
        },
        "remove_all_numeric": {
            "type": "boolean",
            "doc": "Remove all tokens that contain numbers only (e.g. 876).",
            "default": False,
        },
        "remove_short_tokens": {
            "type": "integer",
            "doc": "Remove tokens shorter or equal to this value. If value is <= 0, no filtering will be done.",
            "default": 0,
        },
        "remove_stopwords": {
            "type": "list",
            "doc": "Remove stopwords.",
            "optional": True,
        },
    }
create_outputs_schema() -> ValueSetSchema
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/language_processing/modules/tokens.py
def create_outputs_schema(
    self,
) -> ValueSetSchema:

    return {
        "tokens_array": {
            "type": "array",
            "doc": "The pre-processed content, as an array of lists of strings.",
        }
    }
process(inputs: ValueMap, outputs: ValueMap)
Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/language_processing/modules/tokens.py
def process(self, inputs: ValueMap, outputs: ValueMap):

    import polars as pl
    import pyarrow as pa

    tokens_array: KiaraArray = inputs.get_value_data("tokens_array")
    lowercase: bool = inputs.get_value_data("to_lowercase")
    remove_alphanumeric: bool = inputs.get_value_data("remove_alphanumeric")
    remove_non_alpha: bool = inputs.get_value_data("remove_non_alpha")
    remove_all_numeric: bool = inputs.get_value_data("remove_all_numeric")
    remove_short_tokens: int = inputs.get_value_data("remove_short_tokens")

    if remove_short_tokens is None:
        remove_short_tokens = -1

    _remove_stopwords = inputs.get_value_obj("remove_stopwords")
    if _remove_stopwords.is_set:
        stopword_list: Union[Iterable[str], None] = _remove_stopwords.data.list_data
    else:
        stopword_list = None

    # it's better to have one method every token goes through, then do every test seperately for the token list
    # because that way each token only needs to be touched once (which is more effective)
    def check_token(token: str) -> Union[str, None]:

        # remove short tokens first, since we can save ourselves all the other checks (which are more expensive)
        assert isinstance(remove_short_tokens, int)

        if remove_short_tokens > 0:
            if len(token) <= remove_short_tokens:
                return None

        _token: str = token
        if lowercase:
            _token = _token.lower()

        if remove_non_alpha:
            match = _token if _token.isalpha() else None
            if match is None:
                return None

        # if remove_non_alpha was set, we don't need to worry about tokens that include numbers, since they are already filtered out
        if remove_alphanumeric and not remove_non_alpha:
            match = _token if _token.isalnum() else None
            if match is None:
                return None

        # all-number tokens are already filtered out if the remove_non_alpha methods above ran
        if remove_all_numeric and not remove_non_alpha:
            match = None if _token.isdigit() else _token
            if match is None:
                return None

        if stopword_list and _token and _token.lower() in stopword_list:
            return None

        return _token

    series = pl.Series(name="tokens", values=tokens_array.arrow_array)

    result = series.apply(
        lambda token_list: [
            x for x in (check_token(token) for token in token_list) if x is not None
        ]
    )
    result_array = result.to_arrow()

    # TODO: remove this cast once the array data type can handle non-chunked arrays
    chunked = pa.chunked_array(result_array)
    outputs.set_values(tokens_array=chunked)

Functions

init_nltk()

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/language_processing/modules/tokens.py
def init_nltk():

    import nltk

    nltk.data.path = [NLTK_DOWNLOAD_DIR]

get_stopwords()

Source code in /opt/hostedtoolcache/Python/3.11.4/x64/lib/python3.11/site-packages/kiara_plugin/language_processing/modules/tokens.py
def get_stopwords():

    global _nltk_stopwords  # noqa
    if _nltk_stopwords is not None:
        return _nltk_stopwords

    # TODO: make that smarter
    import nltk

    init_nltk()

    output = io.StringIO()
    nltk.download("punkt", print_error_to=output, download_dir=NLTK_DOWNLOAD_DIR)
    nltk.download("stopwords", print_error_to=output, download_dir=NLTK_DOWNLOAD_DIR)

    log.debug("external.message", source="nltk", msg=output.getvalue())
    from nltk.corpus import stopwords

    _nltk_stopwords = stopwords
    return stopwords
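get_stopwords() returns the nltk.corpus.stopwords corpus reader (after making sure the data is downloaded), so the usual corpus accessors apply. A short usage sketch:

stopwords = get_stopwords()

print(stopwords.fileids()[:3])         # available languages, e.g. ['albanian', 'arabic', 'azerbaijani']
print(stopwords.words("english")[:5])  # e.g. ['i', 'me', 'my', 'myself', 'we']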