GEM-benchmark · vukosim · Jul 24, 2021
diff --git a/transformations/random_swap/README.md b/transformations/random_swap/README.md
@@ -0,0 +1,59 @@
+# Random Swap
+This perturbation adds noise to all types of text sources (sentence, paragraph, etc.) by randomly swapping two neighbouring words in the text.
+
+Author1 name: Tshephisho Sefara
+
+Author1 email: [sefaratj@gmail.com](mailto:sefaratj@gmail.com)
+
+Author1 Affiliation: Council for Scientific and Industrial Research
+
+Author2 name: Vukosi Marivate
+
+Author2 email: [vukosi.marivate@cs.up.ac.za](mailto:vukosi.marivate@cs.up.ac.za), [vima@vima.co.za](mailto:vima@vima.co.za)
+
+Author2 Affiliation: Department of Computer Science, University of Pretoria
+
+## What type of a transformation is this?
+This transformation could augment the semantic representation of the sentence as well as test model robustness by swapping words.
+
+
+## What tasks does it intend to benefit?
+This perturbation would benefit all text classification and text generation tasks.
+
+Benchmark results:
+
+- Text Classification: we run sentiment analysis on a 1% sample of the IMDB dataset. The original accuracy is 96.0 and the perturbed accuracy is 96.0.
+```
+{'accuracy': 96.0,
+ 'dataset_name': 'imdb',
+ 'model_name': 'aychang/roberta-base-imdb',
+ 'no_of_examples': 250,
+ 'pt_accuracy': 96.0,
+ 'split': 'test[:1%]'}
+```
+- Text summarization: we run text summarization on a 1% sample of the xsum dataset. The original BLEU is 15.99 and the perturbed BLEU is 16.09.
+```
+{'bleu': 15.989230311212195,
+ 'dataset_name': 'xsum',
+ 'model_name': 'sshleifer/distilbart-xsum-12-6',
+ 'pt_bleu': 16.09338711985113,
+ 'split': 'test[:1%]'}
+```
+## Related Work
+This perturbation is adapted from our TextAugment library https://github.com/dsfsi/textaugment
+```bibtex
+@inproceedings{marivate2020improving,
+  title={Improving short text classification through global augmentation methods},
+  author={Marivate, Vukosi and Sefara, Tshephisho},
+  booktitle={International Cross-Domain Conference for Machine Learning and Knowledge Extraction},
+  pages={385--399},
+  year={2020},
+  organization={Springer}
+}
+```
+
+
+
+
+## What are the limitations of this transformation?
+Swapping words may introduce grammatical errors and can therefore change the meaning of the sentence.
diff --git a/transformations/random_swap/__init__.py b/transformations/random_swap/__init__.py
@@ -0,0 +1 @@
+from .transformation import *
diff --git a/transformations/random_swap/requirements.txt b/transformations/random_swap/requirements.txt
@@ -0,0 +1 @@
+
diff --git a/transformations/random_swap/test.json b/transformations/random_swap/test.json
@@ -0,0 +1,60 @@
+{
+  "type": "random_swap",
+  "test_cases": [
+    {
+      "class": "RandomSwap",
+      "inputs": {
+        "sentence": "Andrew finally returned the French book to Chris that I bought last week"
+      },
+      "outputs": [
+        {
+          "sentence": "Andrew finally returned the French to book Chris that I bought last week"
+        }
+      ]
+    },
+    {
+      "class": "RandomSwap",
+      "inputs": {
+        "sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, lack an overt predicate to indicate the relation between two or more arguments."
+      },
+      "outputs": [
+        {
+          "sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, an lack overt predicate to indicate the relation between two or more arguments."
+        }
+      ]
+    },
+    {
+      "class": "RandomSwap",
+      "inputs": {
+        "sentence": "Alice in Wonderland is a 2010 American live-action/animated dark fantasy adventure film"
+      },
+      "outputs": [
+        {
+          "sentence": "Alice in Wonderland is a 2010 American live - action / animated dark adventure fantasy film"
+        }
+      ]
+    },
+    {
+      "class": "RandomSwap",
+      "inputs": {
+        "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001"
+      },
+      "outputs": [
+        {
+          "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 2001 to"
+        }
+      ]
+    },
+    {
+      "class": "RandomSwap",
+      "inputs": {
+        "sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization."
+      },
+      "outputs": [
+        {
+          "sentence": "Neuroplasticity is a continuous processing allowing short - term, medium - term, and term - long remodeling of the neuronosynaptic organization."
+        }
+      ]
+    }
+  ]
+}
diff --git a/transformations/random_swap/transformation.py b/transformations/random_swap/transformation.py
@@ -0,0 +1,110 @@
+import random
+import re
+from abc import ABC
+
+import spacy
+
+from interfaces.SentenceOperation import SentenceOperation
+from tasks.TaskTypes import TaskType
+from initialize import spacy_nlp
+
+"""
+Helper class implementing the word-swap perturbation, an input transformation a generation model should be robust against.
+"""
+
+
+class SwapWordTransformation:
+    """Swap one pair of neighbouring non-punctuation tokens in a text."""
+
+    nlp = None
+
+    def __init__(self, seed=0, max_outputs=1, prob=0.5):
+        """Store settings; use `spacy_nlp` if set, else load en_core_web_sm."""
+        self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm")
+        self.max_outputs = max_outputs
+        self.seed = seed
+        self.prob = prob  # stored only; transform() never reads it
+
+    @staticmethod
+    def untokenize(words: list):
+        """
+        Join tokens with spaces, then re-attach punctuation, quotes and
+        contractions to the neighbouring word. Ideally
+        `untokenize(tokenize(text))` equals `text`, except for line breaks.
+        ref: https://github.com/commonsense/metanl/blob/master/metanl/token_utils.py#L28
+        """
+        text = " ".join(words)
+        step1 = (
+            text.replace("`` ", '"').replace(" ''", '"').replace(". . .", "...")
+        )
+        step2 = step1.replace(" ( ", " (").replace(" ) ", ") ")
+        step3 = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", step2)
+        step4 = re.sub(r" ([.,:;?!%]+)$", r"\1", step3)
+        step5 = (
+            step4.replace(" '", "'")
+            .replace(" n't", "n't")
+            .replace("can not", "cannot")
+        )
+        step6 = step5.replace(" ` ", " '")
+        return step6.strip()
+
+    def transform(self, input_text: str):
+        """
+        Return `max_outputs` copies of `input_text`, each with one randomly
+        chosen pair of adjacent non-punctuation tokens exchanged. Texts with
+        fewer than two such tokens are returned unchanged.
+        """
+        random.seed(self.seed)
+        doc = self.nlp(input_text)
+        tokens = [token.text for token in doc]
+        # Positions of swappable tokens; punctuation stays where it is.
+        word_idx = [i for i, token in enumerate(doc) if token.pos_ != "PUNCT"]
+        last = len(word_idx) - 1
+        results = []
+        for _ in range(self.max_outputs):
+            # Check before drawing: randint(0, -1) raises on an empty list.
+            if last < 1:
+                results.append(doc.text)
+                continue
+            start = random.randint(0, last)
+            # The first and last words have a single neighbour; testing
+            # against `last` (not the length) keeps the partner in range.
+            if start == 0:
+                step = 1
+            elif start == last:
+                step = -1
+            else:
+                step = random.choice([-1, 1])
+            first, second = word_idx[start], word_idx[start + step]
+            swapped = list(tokens)
+            swapped[first], swapped[second] = swapped[second], swapped[first]
+            results.append(self.untokenize(swapped))
+        return results
+
+
+"""
+Randomly swap words that are close to each other.
+"""
+
+
+class RandomSwap(SentenceOperation, ABC):
+    """
+    Sentence operation that randomly swaps words in a sentence.
+
+    Created by the authors of TextAugment
+    (https://github.com/dsfsi/textaugment).
+    """
+    tasks = [TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TO_TEXT_GENERATION]
+    languages = ["en"]
+
+    def __init__(self, seed=0, prob=0.5, max_outputs=1):
+        super().__init__(seed, max_outputs=max_outputs)
+        swapper = SwapWordTransformation(seed, max_outputs, prob)
+        self.swap_word_transformation = swapper
+
+    def generate(self, sentence: str):
+        """Return the list of perturbed variants produced for `sentence`."""
+        perturbed = self.swap_word_transformation.transform(input_text=sentence)
+        if self.verbose:
+            print(f"Perturbed Input from {self.name()} : {perturbed}")
+        return perturbed