diff --git a/transformations/random_swap/README.md b/transformations/random_swap/README.md new file mode 100644 index 000000000..6aa9bbc24 --- /dev/null +++ b/transformations/random_swap/README.md @@ -0,0 +1,59 @@ +# Random Swap +This perturbation adds noise to all types of text sources (sentence, paragraph, etc.) by randomly swapping words which are close to each other in a sentence. + +Author1 name: Tshephisho Sefara + +Author1 email: [sefaratj@gmail.com](mailto:sefaratj@gmail.com) + +Author1 Affiliation: Council for Scientific and Industrial Research + +Author2 name: Vukosi Marivate + +Author2 email: [vukosi.marivate@cs.up.ac.za](mailto:vukosi.marivate@cs.up.ac.za), [vima@vima.co.za](mailto:vima@vima.co.za) + +Author2 Affiliation: Department of Computer Science, University of Pretoria + +## What type of a transformation is this? +This transformation could augment the semantic representation of the sentence as well as test model robustness by swapping words. + + +## What tasks does it intend to benefit? +This perturbation would benefit all tasks on text classification and generation. + +Benchmark results: + +- Text Classification: we run sentiment analysis on a 1% sample of the IMDB dataset. The original accuracy is 96.0 and the perturbed accuracy is 96.0. +``` +{'accuracy': 96.0, + 'dataset_name': 'imdb', + 'model_name': 'aychang/roberta-base-imdb', + 'no_of_examples': 250, + 'pt_accuracy': 96.0, + 'split': 'test[:1%]'} +``` +- Text summarization: we run text summarization on a 1% sample of the xsum dataset. The original bleu is 15.99 and the perturbed bleu is 16.1. 
+``` +{'bleu': 15.989230311212195, + 'dataset_name': 'xsum', + 'model_name': 'sshleifer/distilbart-xsum-12-6', + 'pt_bleu': 16.09338711985113, + 'split': 'test[:1%]'} +``` +## Related Work +This perturbation is adapted from our TextAugment library https://github.com/dsfsi/textaugment +```bibtex +@inproceedings{marivate2020improving, + title={Improving short text classification through global augmentation methods}, + author={Marivate, Vukosi and Sefara, Tshephisho}, + booktitle={International Cross-Domain Conference for Machine Learning and Knowledge Extraction}, + pages={385--399}, + year={2020}, + organization={Springer} +} +``` + + + + +## What are the limitations of this transformation? +The transformation's outputs may change the meaning of the sentence by adding grammatical errors. diff --git a/transformations/random_swap/__init__.py b/transformations/random_swap/__init__.py new file mode 100644 index 000000000..930cdce0b --- /dev/null +++ b/transformations/random_swap/__init__.py @@ -0,0 +1 @@ +from .transformation import * diff --git a/transformations/random_swap/requirements.txt b/transformations/random_swap/requirements.txt new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/transformations/random_swap/requirements.txt @@ -0,0 +1 @@ + diff --git a/transformations/random_swap/test.json b/transformations/random_swap/test.json new file mode 100644 index 000000000..8603ec9cc --- /dev/null +++ b/transformations/random_swap/test.json @@ -0,0 +1,60 @@ +{ + "type": "random_swap", + "test_cases": [ + { + "class": "RandomSwap", + "inputs": { + "sentence": "Andrew finally returned the French book to Chris that I bought last week" + }, + "outputs": [ + { + "sentence": "Andrew finally returned the French to book Chris that I bought last week" + } + ] + }, + { + "class": "RandomSwap", + "inputs": { + "sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, lack an overt predicate to indicate the relation between two or more 
arguments." + }, + "outputs": [ + { + "sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, an lack overt predicate to indicate the relation between two or more arguments." + } + ] + }, + { + "class": "RandomSwap", + "inputs": { + "sentence": "Alice in Wonderland is a 2010 American live-action/animated dark fantasy adventure film" + }, + "outputs": [ + { + "sentence": "Alice in Wonderland is a 2010 American live - action / animated dark adventure fantasy film" + } + ] + }, + { + "class": "RandomSwap", + "inputs": { + "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001" + }, + "outputs": [ + { + "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 2001 to" + } + ] + }, + { + "class": "RandomSwap", + "inputs": { + "sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization." + }, + "outputs": [ + { + "sentence": "Neuroplasticity is a continuous processing allowing short - term, medium - term, and term - long remodeling of the neuronosynaptic organization." + } + ] + } + ] +} diff --git a/transformations/random_swap/transformation.py b/transformations/random_swap/transformation.py new file mode 100644 index 000000000..1237031ce --- /dev/null +++ b/transformations/random_swap/transformation.py @@ -0,0 +1,110 @@ +import random +import re +from abc import ABC + +import spacy + +from interfaces.SentenceOperation import SentenceOperation +from tasks.TaskTypes import TaskType +from initialize import spacy_nlp + +""" +Base Class for implementing the different input transformations a generation should be robust against. 
class SwapWordTransformation:
    """Swap one randomly chosen word with a directly adjacent word.

    The text is tokenized with a spaCy pipeline. Tokens tagged ``PUNCT`` are
    never chosen as swap partners, so "adjacent" means the nearest
    non-punctuation token on either side.
    """

    nlp = None

    def __init__(self, seed=0, max_outputs=1, prob=0.5):
        """Store the configuration and resolve the spaCy pipeline.

        Args:
            seed: Value passed to ``random.seed`` at the start of every
                ``transform`` call.
            max_outputs: Number of perturbed sentences produced per input.
            prob: Stored on the instance; ``transform`` does not read it.
        """
        # Use the project-wide pipeline when it is set, otherwise load the
        # small English model.
        self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm")
        self.max_outputs = max_outputs
        self.seed = seed
        self.prob = prob

    @staticmethod
    def untokenize(words: list) -> str:
        """Join tokens back into text, re-attaching punctuation.

        Untokenizing a text undoes the tokenizing operation, restoring
        punctuation and spaces to the places that people expect them to be.
        Ideally, `untokenize(tokenize(text))` should be identical to `text`,
        except for line breaks.
        ref: https://github.com/commonsense/metanl/blob/master/metanl/token_utils.py#L28

        Args:
            words: Token strings in sentence order.

        Returns:
            The joined text with surrounding whitespace stripped.
        """
        text = " ".join(words)
        # Quote tokens and spaced-out ellipses.
        text = (
            text.replace("`` ", '"').replace(" ''", '"').replace(". . .", "...")
        )
        # Parentheses hug the text they enclose.
        text = text.replace(" ( ", " (").replace(" ) ", ") ")
        # Drop the space before punctuation, mid-sentence and at the end.
        text = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", text)
        text = re.sub(r" ([.,:;?!%]+)$", r"\1", text)
        # Re-attach contractions.
        text = (
            text.replace(" '", "'")
            .replace(" n't", "n't")
            .replace("can not", "cannot")
        )
        text = text.replace(" ` ", " '")
        return text.strip()

    def transform(self, input_text: str) -> list:
        """Return ``max_outputs`` copies of the input with two words swapped.

        Every output starts from the original token sequence, so swaps do not
        accumulate across outputs. The random generator is re-seeded with
        ``self.seed`` on every call.

        Args:
            input_text: Text to perturb.

        Returns:
            A list of ``max_outputs`` strings. When the text has fewer than
            two non-punctuation tokens there is nothing to swap and each
            entry is the parsed document's text, unchanged.
        """
        random.seed(self.seed)
        doc = self.nlp(input_text)
        tokens = [token.text for token in doc]
        # Positions (within `tokens`) of the tokens that may be swapped.
        word_positions = [
            position
            for position, token in enumerate(doc)
            if token.pos_ != "PUNCT"
        ]
        last = len(word_positions) - 1

        results = []
        for _ in range(self.max_outputs):
            # Guard before drawing: randint(0, -1) raises ValueError when
            # there are no swappable tokens at all.
            if last < 1:
                results.append(doc.text)
                continue

            start = random.randint(0, last)
            if start == 0:
                # First word only has a right-hand neighbour.
                neighbour = 1
            elif start == last:
                # Last word only has a left-hand neighbour; stepping right
                # would index past the end of `word_positions`.
                neighbour = last - 1
            else:
                neighbour = start + random.choice((-1, 1))

            first_pos = word_positions[start]
            second_pos = word_positions[neighbour]
            swapped = list(tokens)
            swapped[first_pos], swapped[second_pos] = (
                swapped[second_pos],
                swapped[first_pos],
            )
            results.append(self.untokenize(swapped))
        return results


# Randomly swap words that are close to each other.


class RandomSwap(SentenceOperation, ABC):
    """Sentence operation that swaps two neighbouring words at random.

    This class is an implementation of random swapping of words in a
    sentence. Created by the Authors of TextAugment
    https://github.com/dsfsi/textaugment
    """

    tasks = [TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TO_TEXT_GENERATION]
    languages = ["en"]

    def __init__(self, seed=0, prob=0.5, max_outputs=1):
        """Configure the operation and build the underlying word swapper.

        Args:
            seed: Forwarded to the base class and to the word swapper.
            prob: Forwarded to the word swapper.
            max_outputs: Number of perturbed sentences returned per input.
        """
        super().__init__(seed, max_outputs=max_outputs)
        self.swap_word_transformation = SwapWordTransformation(
            seed, max_outputs, prob
        )

    def generate(self, sentence: str):
        """Return the perturbed variants of ``sentence`` as a list of strings."""
        perturbed = self.swap_word_transformation.transform(
            input_text=sentence,
        )
        # `verbose` and `name()` are not defined in this file; presumably
        # they come from SentenceOperation.
        if self.verbose:
            print(f"Perturbed Input from {self.name()} : {perturbed}")
        return perturbed