cbensimon HF Staff committed on
Commit 6124176 · unverified · 0 Parent(s)

Initial commit

.gitattributes ADDED
@@ -0,0 +1,18 @@
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+ .DS_Store
LICENSE ADDED
@@ -0,0 +1,201 @@
+                                  Apache License
+                            Version 2.0, January 2004
+                         http://www.apache.org/licenses/
+
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+    1. Definitions.
+
+       "License" shall mean the terms and conditions for use, reproduction,
+       and distribution as defined by Sections 1 through 9 of this document.
+
+       "Licensor" shall mean the copyright owner or entity authorized by
+       the copyright owner that is granting the License.
+
+       "Legal Entity" shall mean the union of the acting entity and all
+       other entities that control, are controlled by, or are under common
+       control with that entity. For the purposes of this definition,
+       "control" means (i) the power, direct or indirect, to cause the
+       direction or management of such entity, whether by contract or
+       otherwise, or (ii) ownership of fifty percent (50%) or more of the
+       outstanding shares, or (iii) beneficial ownership of such entity.
+
+       "You" (or "Your") shall mean an individual or Legal Entity
+       exercising permissions granted by this License.
+
+       "Source" form shall mean the preferred form for making modifications,
+       including but not limited to software source code, documentation
+       source, and configuration files.
+
+       "Object" form shall mean any form resulting from mechanical
+       transformation or translation of a Source form, including but
+       not limited to compiled object code, generated documentation,
+       and conversions to other media types.
+
+       "Work" shall mean the work of authorship, whether in Source or
+       Object form, made available under the License, as indicated by a
+       copyright notice that is included in or attached to the work
+       (an example is provided in the Appendix below).
+
+       "Derivative Works" shall mean any work, whether in Source or Object
+       form, that is based on (or derived from) the Work and for which the
+       editorial revisions, annotations, elaborations, or other modifications
+       represent, as a whole, an original work of authorship. For the purposes
+       of this License, Derivative Works shall not include works that remain
+       separable from, or merely link (or bind by name) to the interfaces of,
+       the Work and Derivative Works thereof.
+
+       "Contribution" shall mean any work of authorship, including
+       the original version of the Work and any modifications or additions
+       to that Work or Derivative Works thereof, that is intentionally
+       submitted to Licensor for inclusion in the Work by the copyright owner
+       or by an individual or Legal Entity authorized to submit on behalf of
+       the copyright owner. For the purposes of this definition, "submitted"
+       means any form of electronic, verbal, or written communication sent
+       to the Licensor or its representatives, including but not limited to
+       communication on electronic mailing lists, source code control systems,
+       and issue tracking systems that are managed by, or on behalf of, the
+       Licensor for the purpose of discussing and improving the Work, but
+       excluding communication that is conspicuously marked or otherwise
+       designated in writing by the copyright owner as "Not a Contribution."
+
+       "Contributor" shall mean Licensor and any individual or Legal Entity
+       on behalf of whom a Contribution has been received by Licensor and
+       subsequently incorporated within the Work.
+
+    2. Grant of Copyright License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       copyright license to reproduce, prepare Derivative Works of,
+       publicly display, publicly perform, sublicense, and distribute the
+       Work and such Derivative Works in Source or Object form.
+
+    3. Grant of Patent License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       (except as stated in this section) patent license to make, have made,
+       use, offer to sell, sell, import, and otherwise transfer the Work,
+       where such license applies only to those patent claims licensable
+       by such Contributor that are necessarily infringed by their
+       Contribution(s) alone or by combination of their Contribution(s)
+       with the Work to which such Contribution(s) was submitted. If You
+       institute patent litigation against any entity (including a
+       cross-claim or counterclaim in a lawsuit) alleging that the Work
+       or a Contribution incorporated within the Work constitutes direct
+       or contributory patent infringement, then any patent licenses
+       granted to You under this License for that Work shall terminate
+       as of the date such litigation is filed.
+
+    4. Redistribution. You may reproduce and distribute copies of the
+       Work or Derivative Works thereof in any medium, with or without
+       modifications, and in Source or Object form, provided that You
+       meet the following conditions:
+
+       (a) You must give any other recipients of the Work or
+           Derivative Works a copy of this License; and
+
+       (b) You must cause any modified files to carry prominent notices
+           stating that You changed the files; and
+
+       (c) You must retain, in the Source form of any Derivative Works
+           that You distribute, all copyright, patent, trademark, and
+           attribution notices from the Source form of the Work,
+           excluding those notices that do not pertain to any part of
+           the Derivative Works; and
+
+       (d) If the Work includes a "NOTICE" text file as part of its
+           distribution, then any Derivative Works that You distribute must
+           include a readable copy of the attribution notices contained
+           within such NOTICE file, excluding those notices that do not
+           pertain to any part of the Derivative Works, in at least one
+           of the following places: within a NOTICE text file distributed
+           as part of the Derivative Works; within the Source form or
+           documentation, if provided along with the Derivative Works; or,
+           within a display generated by the Derivative Works, if and
+           wherever such third-party notices normally appear. The contents
+           of the NOTICE file are for informational purposes only and
+           do not modify the License. You may add Your own attribution
+           notices within Derivative Works that You distribute, alongside
+           or as an addendum to the NOTICE text from the Work, provided
+           that such additional attribution notices cannot be construed
+           as modifying the License.
+
+       You may add Your own copyright statement to Your modifications and
+       may provide additional or different license terms and conditions
+       for use, reproduction, or distribution of Your modifications, or
+       for any such Derivative Works as a whole, provided Your use,
+       reproduction, and distribution of the Work otherwise complies with
+       the conditions stated in this License.
+
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+       any Contribution intentionally submitted for inclusion in the Work
+       by You to the Licensor shall be under the terms and conditions of
+       this License, without any additional terms or conditions.
+       Notwithstanding the above, nothing herein shall supersede or modify
+       the terms of any separate license agreement you may have executed
+       with Licensor regarding such Contributions.
+
+    6. Trademarks. This License does not grant permission to use the trade
+       names, trademarks, service marks, or product names of the Licensor,
+       except as required for reasonable and customary use in describing the
+       origin of the Work and reproducing the content of the NOTICE file.
+
+    7. Disclaimer of Warranty. Unless required by applicable law or
+       agreed to in writing, Licensor provides the Work (and each
+       Contributor provides its Contributions) on an "AS IS" BASIS,
+       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+       implied, including, without limitation, any warranties or conditions
+       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+       PARTICULAR PURPOSE. You are solely responsible for determining the
+       appropriateness of using or redistributing the Work and assume any
+       risks associated with Your exercise of permissions under this License.
+
+    8. Limitation of Liability. In no event and under no legal theory,
+       whether in tort (including negligence), contract, or otherwise,
+       unless required by applicable law (such as deliberate and grossly
+       negligent acts) or agreed to in writing, shall any Contributor be
+       liable to You for damages, including any direct, indirect, special,
+       incidental, or consequential damages of any character arising as a
+       result of this License or out of the use or inability to use the
+       Work (including but not limited to damages for loss of goodwill,
+       work stoppage, computer failure or malfunction, or any and all
+       other commercial damages or losses), even if such Contributor
+       has been advised of the possibility of such damages.
+
+    9. Accepting Warranty or Additional Liability. While redistributing
+       the Work or Derivative Works thereof, You may choose to offer,
+       and charge a fee for, acceptance of support, warranty, indemnity,
+       or other liability obligations and/or rights consistent with this
+       License. However, in accepting such obligations, You may act only
+       on Your own behalf and on Your sole responsibility, not on behalf
+       of any other Contributor, and only if You agree to indemnify,
+       defend, and hold each Contributor harmless for any liability
+       incurred by, or claims asserted against, such Contributor by reason
+       of your accepting any such warranty or additional liability.
+
+    END OF TERMS AND CONDITIONS
+
+    APPENDIX: How to apply the Apache License to your work.
+
+       To apply the Apache License to your work, attach the following
+       boilerplate notice, with the fields enclosed by brackets "[]"
+       replaced with your own identifying information. (Don't include
+       the brackets!) The text should be enclosed in the appropriate
+       comment syntax for the file format. We also recommend that a
+       file or class name and description of purpose be included on the
+       same "printed page" as the copyright notice for easier
+       identification within third-party archives.
+
+    Copyright 2021 SummVis
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
README.md ADDED
@@ -0,0 +1,372 @@
+ # SummVis
+
+ SummVis is an open-source visualization tool that supports fine-grained analysis of summarization models, data, and evaluation
+ metrics. Through its lexical and semantic visualizations, SummVis enables in-depth exploration across important dimensions such as factual consistency and abstractiveness.
+
+ Authors: [Jesse Vig](https://twitter.com/jesse_vig)<sup>1</sup>,
+ [Wojciech Kryściński](https://twitter.com/iam_wkr)<sup>1</sup>,
+ [Karan Goel](https://twitter.com/krandiash)<sup>2</sup>,
+ [Nazneen Fatema Rajani](https://twitter.com/nazneenrajani)<sup>1</sup><br/>
+ <sup>1</sup>[Salesforce Research](https://einstein.ai/) <sup>2</sup>[Stanford Hazy Research](https://hazyresearch.stanford.edu/)
+
+ 📖 [Paper](https://arxiv.org/abs/2104.07605)
+ 🎥 [Demo](https://vimeo.com/540429745)
+
+ <p>
+ <img src="website/demo.gif" alt="Demo gif"/>
+ </p>
+
+ _Note: SummVis is under active development, so expect continued updates in the coming weeks and months.
+ Feel free to raise issues for questions, suggestions, requests, or bug reports._
+
+ ## Table of Contents
+ - [User guide](#user-guide)
+ - [Installation](#installation)
+ - [Quickstart](#quickstart)
+ - [Running with pre-loaded datasets](#running-with-pre-loaded-datasets)
+ - [Get your data into SummVis](#get-your-data-into-summvis)
+ - [Citation](#citation)
+ - [Acknowledgements](#acknowledgements)
+
+ ## User guide
+
+ ### Overview
+ SummVis is a tool for analyzing abstractive summarization systems. It provides fine-grained insights on summarization
+ models, data, and evaluation metrics by visualizing the relationships between source documents, reference summaries,
+ and generated summaries, as illustrated in the figure below.<br/>
+
+ ![Relations between source, reference, and generated summaries](website/triangle.png)
+
+ ### Interface
+
+ The SummVis interface is shown below. The example displayed is the first record from the
+ [CNN / Daily Mail](https://huggingface.co/datasets/cnn_dailymail) validation set.
+
+ ![Main interface](website/main-vis.jpg)
+
+
+ #### Components
+
+ **(a)** Configuration panel<br/>
+ **(b)** Source document (or reference summary, depending on configuration)<br/>
+ **(c)** Generated summaries (and/or reference summary, depending on configuration)<br/>
+ **(d)** Scroll bar with global view of annotations<br/>
+
+ #### Annotations
+ <img src="website/annotations.png" width="548" height="39" alt="Annotations"/>
+
+ **N-gram overlap:** Word sequences that overlap between the document on the left and
+ the selected summary on the right. Underlines are color-coded by the index of the summary sentence.<br/>
+ **Semantic overlap**: Words in the summary that are semantically close to one or more words in the document on the left.<br/>
+ **Novel words**: Words in the summary that do not appear in the document on the left.<br/>
+ **Novel entities**: Entity words in the summary that do not appear in the document on the left.<br/>
+
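+ For intuition, "novel words" can be pictured roughly as a set difference over tokens. The snippet below is a
+ simplified illustration, not the tool's exact annotation logic:
+
+ ```python
+ document = "The quick brown fox jumps over the lazy dog".lower().split()
+ summary = "A speedy fox jumps over a dog".lower().split()
+ novel = [w for w in summary if w not in set(document)]
+ print(novel)  # ['a', 'speedy', 'a']
+ ```
+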
+ ### Limitations
+ Currently only English text is supported.
+
+ ## Installation
+ **IMPORTANT**: Please use `python>=3.8`, since some dependencies require it for installation.
+ ```shell
+ # Requires python>=3.8
+ git clone https://github.com/robustness-gym/summvis.git
+ cd summvis
+ pip install -r requirements.txt
+ python -m spacy download en_core_web_sm
+ ```
+
+ Installation takes around 2 minutes on a MacBook Pro.
+
+ ## Quickstart
+ Follow the steps below to start using SummVis immediately.
+
+ ### 1. Download and extract data
+ Download our pre-cached dataset that contains predictions for state-of-the-art models such as PEGASUS and BART on
+ 1000 examples taken from the CNN / Daily Mail validation set.
+ ```shell
+ mkdir data
+ mkdir preprocessing
+ curl https://storage.googleapis.com/sfr-summvis-data-research/cnn_dailymail_1000.validation.anonymized.zip --output preprocessing/cnn_dailymail_1000.validation.anonymized.zip
+ unzip preprocessing/cnn_dailymail_1000.validation.anonymized.zip -d preprocessing/
+ ```
+
+ ### 2. Deanonymize data
+ Next, we'll need to add the original examples from the CNN / Daily Mail dataset to deanonymize the data (this information
+ is omitted for copyright reasons). The `preprocessing.py` script can be used for this with the `--deanonymize` flag.
+
+ #### Deanonymize 10 examples:
+ ```shell
+ python preprocessing.py \
+   --deanonymize \
+   --dataset_rg preprocessing/cnn_dailymail_1000.validation.anonymized \
+   --dataset cnn_dailymail \
+   --version 3.0.0 \
+   --split validation \
+   --processed_dataset_path data/10:cnn_dailymail_1000.validation \
+   --n_samples 10
+ ```
+ This will take either a few seconds or a few minutes, depending on whether you've previously loaded CNN / Daily Mail from
+ the Datasets library.
+
+ ### 3. Run SummVis
+ Finally, we're ready to run the Streamlit app. Once the app loads, make sure it's pointing to the right `File` at the top
+ of the interface.
+ ```shell
+ streamlit run summvis.py
+ ```
+
+ ## Running with pre-loaded datasets
+
+ In this section we extend the approach described in [Quickstart](#quickstart) to other pre-loaded datasets.
+
+ ### 1. Download one of the pre-loaded datasets:
+
+ ##### CNN / Daily Mail (1000 examples from validation set): https://storage.googleapis.com/sfr-summvis-data-research/cnn_dailymail_1000.validation.anonymized.zip
+ ##### CNN / Daily Mail (full validation set): https://storage.googleapis.com/sfr-summvis-data-research/cnn_dailymail.validation.anonymized.zip
+ ##### XSum (1000 examples from validation set): https://storage.googleapis.com/sfr-summvis-data-research/xsum_1000.validation.anonymized.zip
+ ##### XSum (full validation set): https://storage.googleapis.com/sfr-summvis-data-research/xsum.validation.anonymized.zip
+
+ We recommend that you choose the smallest dataset that fits your needs in order to minimize download / preprocessing time.
+
+ #### Example: Download and unzip CNN / Daily Mail
+ ```shell
+ mkdir data
+ mkdir preprocessing
+ curl https://storage.googleapis.com/sfr-summvis-data-research/cnn_dailymail_1000.validation.anonymized.zip --output preprocessing/cnn_dailymail_1000.validation.anonymized.zip
+ unzip preprocessing/cnn_dailymail_1000.validation.anonymized.zip -d preprocessing/
+ ```
+
+ #### Example: Download and unzip XSum
+ ```shell
+ mkdir data
+ mkdir preprocessing
+ curl https://storage.googleapis.com/sfr-summvis-data-research/xsum_1000.validation.anonymized.zip --output preprocessing/xsum_1000.validation.anonymized.zip
+ unzip preprocessing/xsum_1000.validation.anonymized.zip -d preprocessing/
+ ```
+
+ ### 2. Deanonymize *n* examples:
+
+ Set the `--n_samples` argument and name the `--processed_dataset_path` output file accordingly.
+
+ #### Example: Deanonymize 100 examples from CNN / Daily Mail:
+ ```shell
+ python preprocessing.py \
+   --deanonymize \
+   --dataset_rg preprocessing/cnn_dailymail_1000.validation.anonymized \
+   --dataset cnn_dailymail \
+   --version 3.0.0 \
+   --split validation \
+   --processed_dataset_path data/100:cnn_dailymail_1000.validation \
+   --n_samples 100
+ ```
+
+ #### Example: Deanonymize all pre-loaded examples from CNN / Daily Mail (1000-example dataset):
+ ```shell
+ python preprocessing.py \
+   --deanonymize \
+   --dataset_rg preprocessing/cnn_dailymail_1000.validation.anonymized \
+   --dataset cnn_dailymail \
+   --version 3.0.0 \
+   --split validation \
+   --processed_dataset_path data/full:cnn_dailymail_1000.validation \
+   --n_samples 1000
+ ```
+
+ #### Example: Deanonymize all pre-loaded examples from CNN / Daily Mail (full dataset):
+ ```shell
+ python preprocessing.py \
+   --deanonymize \
+   --dataset_rg preprocessing/cnn_dailymail.validation.anonymized \
+   --dataset cnn_dailymail \
+   --version 3.0.0 \
+   --split validation \
+   --processed_dataset_path data/full:cnn_dailymail.validation
+ ```
+
+ #### Example: Deanonymize all pre-loaded examples from XSum (1000-example dataset):
+ ```shell
+ python preprocessing.py \
+   --deanonymize \
+   --dataset_rg preprocessing/xsum_1000.validation.anonymized \
+   --dataset xsum \
+   --split validation \
+   --processed_dataset_path data/full:xsum_1000.validation \
+   --n_samples 1000
+ ```
+
+ ### 3. Run SummVis
+ Once the app loads, make sure it's pointing to the right `File` at the top
+ of the interface.
+ ```shell
+ streamlit run summvis.py
+ ```
+
+ Alternatively, you can point SummVis to the folder where your data is stored:
+ ```shell
+ streamlit run summvis.py -- --path your/path/to/data
+ ```
+ Note that the additional `--` is not a mistake; it is required to pass command-line arguments through Streamlit.
+
+
+ ## Get your data into SummVis
+
+ The simplest way to use SummVis with your own data is to create a jsonl file of the following format:
+
+ ```
+ {"document": "This is the first source document", "summary:reference": "This is the reference summary", "summary:testmodel1": "This is the summary for testmodel1", "summary:testmodel2": "This is the summary for testmodel2"}
+ {"document": "This is the second source document", "summary:reference": "This is the reference summary", "summary:testmodel1": "This is the summary for testmodel1", "summary:testmodel2": "This is the summary for testmodel2"}
+ ```
+
+ The key for the reference summary must equal `summary:reference`, and the key for any other summary must be of the form
+ `summary:<summary_name>`, e.g. `summary:BART`. The document and at least one summary (reference, other, or both) are required.
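+
+ For instance, a file in this format can be written with a few lines of Python. The snippet below is a minimal
+ sketch; the model name and texts are placeholders, and the `data` directory is assumed to exist:
+
+ ```python
+ import json
+
+ records = [
+     {
+         "document": "This is the first source document",
+         "summary:reference": "This is the reference summary",
+         "summary:BART": "This is the BART summary",
+     },
+ ]
+
+ # Write one JSON object per line (the jsonl convention).
+ with open("data/my_dataset.jsonl", "w") as f:
+     for record in records:
+         f.write(json.dumps(record) + "\n")
+ ```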
+
+ The following additional install step is required:
+ ```
+ python -m spacy download en_core_web_lg
+ ```
+
+ You have two options to load this jsonl file into the tool:
+
+ #### Option 1: Load the jsonl file directly
+
+ The disadvantage of this approach is that all computations are performed in real time. This is particularly expensive for
+ semantic similarity, which uses a Transformer model. As a result, each example will be slow to load (~5-15 seconds on a MacBook Pro).
+
+ 1. Place the jsonl file in the `data` directory. Note that the file must be named with a `.jsonl` extension.
+ 2. Start SummVis: `streamlit run summvis.py`
+ 3. Select your jsonl file from the `File` dropdown at the top of the interface.
+
+ #### Option 2: Preprocess jsonl file (recommended)
+
+ You may run `preprocessing.py` to precompute all data required in the interface (running `spaCy`, lexical and semantic
+ aligners) and save a cache file, which can be read directly into the tool. Note that this script may run for a while
+ (~5-15 seconds per example on a MacBook Pro for
+ documents of typical length found in CNN / Daily Mail or XSum), and will be greatly expedited by running on a GPU.
+
+ 1. Run the preprocessing script to generate a cache file:
+ ```shell
+ python preprocessing.py \
+   --workflow \
+   --dataset_jsonl path/to/my_dataset.jsonl \
+   --processed_dataset_path path/to/my_cache_file
+ ```
+ You may wish to first try it with a subset of your data by adding the following argument: `--n_samples <number_of_samples>`.
+
+ 2. Copy the output cache file to the `data` directory
+ 3. Start SummVis: `streamlit run summvis.py`
+ 4. Select your file from the `File` dropdown at the top of the interface.
+
+ As an alternative to steps 2-3, you may point SummVis to the folder in which the cache file is stored:
+ ```shell
+ streamlit run summvis.py -- --path <parent_directory_of_cache_file>
+ ```
+ ### Generating predictions
+ The instructions in the previous section assume access to model predictions. We also provide tools to load predictions,
+ either by downloading datasets with precomputed predictions or by running
+ a script to generate predictions for HuggingFace-compatible models. In this section we describe an end-to-end pipeline
+ for using these tools.
+
+
+ Prior to running the following, an additional install step is required:
+
+ ```
+ python -m spacy download en_core_web_lg
+ ```
+
+ #### 1. Standardize and save the dataset to disk.
+ Loads a dataset from HF, or any dataset that you have, and stores it in a
+ standardized format with columns for `document` and `summary:reference`.
+
+ ##### Example: Save the CNN / Daily Mail validation split to disk as a jsonl file.
+ ```shell
+ python preprocessing.py \
+   --standardize \
+   --dataset cnn_dailymail \
+   --version 3.0.0 \
+   --split validation \
+   --save_jsonl_path preprocessing/cnn_dailymail.validation.jsonl
+ ```
+
+ ##### Example: Load custom `my_dataset.jsonl`, standardize, and save.
+ ```shell
+ python preprocessing.py \
+   --standardize \
+   --dataset_jsonl path/to/my_dataset.jsonl \
+   --save_jsonl_path preprocessing/my_dataset.jsonl
+ ```
+
+ Expected format of `my_dataset.jsonl`:
+ ```
+ {"document": "This is the first source document", "summary:reference": "This is the reference summary"}
+ {"document": "This is the second source document", "summary:reference": "This is the reference summary"}
+ ```
+
+ If you wish to use column names other than `document` and `summary:reference`, you may specify custom column names
+ using the `doc_column` and `reference_column` command-line arguments.
+
+
+ #### 2. Add predictions to the saved dataset.
+ Takes a saved dataset that has already been standardized and adds predictions to it
+ from prediction jsonl files. Cached predictions for several models are available here:
+ https://storage.googleapis.com/sfr-summvis-data-research/predictions.zip
+
+ You may also generate your own predictions using [this script](generation.py).
+
+ ##### Example: Add 6 prediction files for PEGASUS and BART to the dataset.
+ ```shell
+ python preprocessing.py \
+   --join_predictions \
+   --dataset_jsonl preprocessing/cnn_dailymail.validation.jsonl \
+   --prediction_jsonls \
+     predictions/bart-cnndm.cnndm.validation.results.anonymized \
+     predictions/bart-xsum.cnndm.validation.results.anonymized \
+     predictions/pegasus-cnndm.cnndm.validation.results.anonymized \
+     predictions/pegasus-multinews.cnndm.validation.results.anonymized \
+     predictions/pegasus-newsroom.cnndm.validation.results.anonymized \
+     predictions/pegasus-xsum.cnndm.validation.results.anonymized \
+   --save_jsonl_path preprocessing/cnn_dailymail.validation.jsonl
+ ```
+
+ #### 3. Run the preprocessing workflow and save the dataset.
+ Takes a saved dataset that has been standardized and has had predictions added,
+ applies all the preprocessing steps to it (running `spaCy`, lexical and semantic aligners),
+ and stores the processed dataset back to disk.
+
+ ##### Example: Autorun with default settings on a few examples to try it.
+ ```shell
+ python preprocessing.py \
+   --workflow \
+   --dataset_jsonl preprocessing/cnn_dailymail.validation.jsonl \
+   --processed_dataset_path data/cnn_dailymail.validation \
+   --try_it
+ ```
+
+ ##### Example: Autorun with default settings on all examples.
+ ```shell
+ python preprocessing.py \
+   --workflow \
+   --dataset_jsonl preprocessing/cnn_dailymail.validation.jsonl \
+   --processed_dataset_path data/cnn_dailymail
+ ```
+
+
+ ## Citation
+
+ When referencing this repository, please cite [this paper](https://arxiv.org/abs/2104.07605):
+
+ ```
+ @misc{vig2021summvis,
+       title={SummVis: Interactive Visual Analysis of Models, Data, and Evaluation for Text Summarization},
+       author={Jesse Vig and Wojciech Kryscinski and Karan Goel and Nazneen Fatema Rajani},
+       year={2021},
+       eprint={2104.07605},
+       archivePrefix={arXiv},
+       primaryClass={cs.CL},
+       url={https://arxiv.org/abs/2104.07605}
+ }
+ ```
+
+ ## Acknowledgements
+
+ We thank [Michael Correll](http://correll.io) for his valuable feedback.
+
+
align.py ADDED
@@ -0,0 +1,346 @@
+ import heapq
+ import itertools
+ from abc import ABC, abstractmethod
+ from collections import defaultdict
+ from operator import itemgetter
+ from typing import List, Dict, Tuple
+ from typing import Sequence
+
+ import numpy as np
+ import torch
+ from bert_score import BERTScorer
+ from nltk import PorterStemmer
+ from spacy.tokens import Doc, Span
+ from toolz import itertoolz
+ from transformers import AutoTokenizer
+ from transformers.tokenization_utils_base import PaddingStrategy
+
+
+ class EmbeddingModel(ABC):
+     @abstractmethod
+     def embed(
+         self,
+         sents: List[Span]
+     ):
+         pass
+
+
+ class ContextualEmbedding(EmbeddingModel):
+
+     def __init__(self, model, tokenizer_name, max_length):
+         self.model = model
+         self.tokenizer = SpacyHuggingfaceTokenizer(tokenizer_name, max_length)
+         self._device = model.device
+
+     def embed(
+         self,
+         sents: List[Span]
+     ):
+         encoded_input, special_tokens_masks, token_alignments = self.tokenizer.batch_encode(sents)
+         encoded_input = {k: v.to(self._device) for k, v in encoded_input.items()}
+         with torch.no_grad():
+             model_output = self.model(**encoded_input)
+         embeddings = model_output[0].cpu()
+
+         spacy_embs_list = []
+         for embs, mask, token_alignment \
+                 in zip(embeddings, special_tokens_masks, token_alignments):
+             mask = torch.tensor(mask)
+             embs = embs[mask == 0]  # Filter embeddings at special token positions
+             spacy_embs = []
+             for hf_idxs in token_alignment:
+                 if hf_idxs is None:
+                     pooled_embs = torch.zeros_like(embs[0])
+                 else:
+                     pooled_embs = embs[hf_idxs].mean(dim=0)  # Pool embeddings that map to the same spacy token
+                 spacy_embs.append(pooled_embs.numpy())
+             spacy_embs = np.stack(spacy_embs)
+             spacy_embs = spacy_embs / np.linalg.norm(spacy_embs, axis=-1, keepdims=True)  # Normalize
+             spacy_embs_list.append(spacy_embs)
+         for embs, sent in zip(spacy_embs_list, sents):
+             assert len(embs) == len(sent)
+         return spacy_embs_list
+
+
+ class StaticEmbedding(EmbeddingModel):
+
+     def embed(
+         self,
+         sents: List[Span]
+     ):
+         return [
+             np.stack([t.vector / (t.vector_norm or 1) for t in sent])
+             for sent in sents
+         ]
+
+
+ class EmbeddingAligner:
+
+     def __init__(
+         self,
+         embedding: EmbeddingModel,
+         threshold: float,
+         top_k: int,
+         baseline_val=0
+     ):
+         self.threshold = threshold
+         self.top_k = top_k
+         self.embedding = embedding
+         self.baseline_val = baseline_val
+
+     def align(
+         self,
+         source: Doc,
+         targets: Sequence[Doc]
+     ) -> List[Dict]:
+         """Compute alignment from summary tokens to doc tokens with greatest semantic similarity
+
+         Args:
+             source: Source spaCy document
+             targets: Target spaCy documents
+
+         Returns: List of alignments, one for each target document
+         """
+         if len(source) == 0:
+             return [{} for _ in targets]
+         all_sents = list(source.sents) + list(itertools.chain.from_iterable(target.sents for target in targets))
+         chunk_sizes = [_iter_len(source.sents)] + \
+                       [_iter_len(target.sents) for target in targets]
+         all_sents_token_embeddings = self.embedding.embed(all_sents)
+         chunked_sents_token_embeddings = _split(all_sents_token_embeddings, chunk_sizes)
+         source_sent_token_embeddings = chunked_sents_token_embeddings[0]
+         source_token_embeddings = np.concatenate(source_sent_token_embeddings)
+         for token_idx, token in enumerate(source):
+             if token.is_stop or token.is_punct:
+                 source_token_embeddings[token_idx] = 0
+         alignments = []
+         for i, target in enumerate(targets):
+             target_sent_token_embeddings = chunked_sents_token_embeddings[i + 1]
+             target_token_embeddings = np.concatenate(target_sent_token_embeddings)
+             for token_idx, token in enumerate(target):
+                 if token.is_stop or token.is_punct:
+                     target_token_embeddings[token_idx] = 0
+             alignment = defaultdict(list)
+             for score, target_idx, source_idx in self._emb_sim_sparse(
+                 target_token_embeddings,
+                 source_token_embeddings,
+             ):
+                 alignment[target_idx].append((source_idx, score))
+             # TODO use argpartition to get nlargest
+             for j in list(alignment):
+                 alignment[j] = heapq.nlargest(self.top_k, alignment[j], itemgetter(1))
+             alignments.append(alignment)
+         return alignments
+
+     def _emb_sim_sparse(self, embs_1, embs_2):
+         sim = embs_1 @ embs_2.T
+         sim = (sim - self.baseline_val) / (1 - self.baseline_val)
+         keep = sim > self.threshold
+         keep_idxs_1, keep_idxs_2 = np.where(keep)
+         keep_scores = sim[keep]
+         return list(zip(keep_scores, keep_idxs_1, keep_idxs_2))
+
+
+ class BertscoreAligner(EmbeddingAligner):
+     def __init__(
+         self,
+         threshold,
+         top_k
+     ):
+         scorer = BERTScorer(lang="en", rescale_with_baseline=True)
+         model = scorer._model
+         embedding = ContextualEmbedding(model, "roberta-large", 510)
+         baseline_val = scorer.baseline_vals[2].item()
+
+         super(BertscoreAligner, self).__init__(
+             embedding, threshold, top_k, baseline_val
+         )
+
+
+ class StaticEmbeddingAligner(EmbeddingAligner):
+     def __init__(
+         self,
+         threshold,
+         top_k
+     ):
+         embedding = StaticEmbedding()
+         super(StaticEmbeddingAligner, self).__init__(
+             embedding, threshold, top_k
+         )
+
+
+ class NGramAligner:
+
+     def __init__(self):
+         self.stemmer = PorterStemmer()
+
+     def align(
+         self,
+         source: Doc,
+         targets: List[Doc],
+     ) -> List[Dict]:
+
+         alignments = []
+         source_ngram_spans = self._get_ngram_spans(source)
+         for target in targets:
+             target_ngram_spans = self._get_ngram_spans(target)
+             alignments.append(
+                 self._align_ngrams(target_ngram_spans, source_ngram_spans)
+             )
+         return alignments
+
+     def _get_ngram_spans(
+         self,
+         doc: Doc,
+     ):
+         ngrams = []
+         for sent in doc.sents:
+             for n in range(1, len(list(sent))):
+                 tokens = [t for t in sent if not (t.is_stop or t.is_punct)]
+                 ngrams.extend(_ngrams(tokens, n))
+
+         def ngram_key(ngram):
+             return tuple(self.stemmer.stem(token.text).lower() for token in ngram)
+
+         key_to_ngrams = itertoolz.groupby(ngram_key, ngrams)
+         key_to_spans = {}
+         for k, grouped_ngrams in key_to_ngrams.items():
+             key_to_spans[k] = [
+                 (ngram[0].i, ngram[-1].i + 1)
+                 for ngram in grouped_ngrams
+             ]
+         return key_to_spans
+
+     def _align_ngrams(
+         self,
+         ngram_spans_1: Dict[Tuple[str], List[Tuple[int, int]]],
+         ngram_spans_2: Dict[Tuple[str], List[Tuple[int, int]]]
+     ) -> Dict[Tuple[int, int], List[Tuple[int, int]]]:
+         """Align ngram spans between two documents
+
+         Args:
+             ngram_spans_1: Map from (normalized_token1, normalized_token2, ...) n-gram tuple to a list of token spans
+                 of format (start_pos, end_pos)
+             ngram_spans_2: Same format as above, but for second text
+
+         Returns: map from each (start, end) span in text 1 to list of aligned (start, end) spans in text 2
+         """
+         if not ngram_spans_1 or not ngram_spans_2:
+             return {}
+         max_span_end_1 = max(span[1] for span in itertools.chain.from_iterable(ngram_spans_1.values()))
+         token_is_available_1 = [True] * max_span_end_1
+         matched_keys = list(set(ngram_spans_1.keys()) & set(ngram_spans_2.keys()))  # Matched normalized ngrams between the two texts
+         matched_keys.sort(key=len, reverse=True)  # Process n-grams from longest to shortest
+
+         alignment = defaultdict(list)  # Map from each matched span in text 1 to list of aligned spans in text 2
+         for key in matched_keys:
+             spans_1 = ngram_spans_1[key]
+             spans_2 = ngram_spans_2[key]
+             available_spans_1 = [span for span in spans_1 if all(token_is_available_1[slice(*span)])]
+             matched_spans_1 = []
+             if available_spans_1 and spans_2:
+                 # If the ngram can be matched to available spans in both sequences
+                 for span in available_spans_1:
+                     # It's possible that these newly matched spans may be overlapping with one another, so
+                     # check that token positions are still available (only one span allowed per token in text 1):
+                     if all(token_is_available_1[slice(*span)]):
+                         matched_spans_1.append(span)
+                         token_is_available_1[slice(*span)] = [False] * (span[1] - span[0])
+             for span1 in matched_spans_1:
+                 alignment[span1] = spans_2
+
+         return alignment
+
+
+ class SpacyHuggingfaceTokenizer:
+     def __init__(
+         self,
+         model_name,
+         max_length
+     ):
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
+         self.max_length = max_length
+
+     def batch_encode(
+         self,
+         sents: List[Span]
+     ):
+         token_alignments = []
+         token_ids_list = []
+
+         # Tokenize each sentence and special tokens.
+         for sent in sents:
+             hf_tokens, token_alignment = self.tokenize(sent)
+             token_alignments.append(token_alignment)
+             token_ids = self.tokenizer.convert_tokens_to_ids(hf_tokens)
+             encoding = self.tokenizer.prepare_for_model(
+                 token_ids,
+                 add_special_tokens=True,
+                 padding=False,
+             )
+             token_ids_list.append(encoding['input_ids'])
+
+         # Add padding
+         max_length = max(map(len, token_ids_list))
+         attention_mask = []
+         input_ids = []
+         special_tokens_masks = []
+         for token_ids in token_ids_list:
+             encoding = self.tokenizer.prepare_for_model(
+                 token_ids,
+                 padding=PaddingStrategy.MAX_LENGTH,
+                 max_length=max_length,
+                 add_special_tokens=False
+             )
+             input_ids.append(encoding['input_ids'])
+             attention_mask.append(encoding['attention_mask'])
+             special_tokens_masks.append(
+                 self.tokenizer.get_special_tokens_mask(
+                     encoding['input_ids'],
+                     already_has_special_tokens=True
+                 )
+             )
+
+         encoded = {
+             'input_ids': torch.tensor(input_ids),
+             'attention_mask': torch.tensor(attention_mask)
+         }
+         return encoded, special_tokens_masks, token_alignments
+
+     def tokenize(
+         self,
+         sent
+     ):
+         """Convert spacy sentence to huggingface tokens and compute the alignment"""
+         hf_tokens = []
+         token_alignment = []
+         for i, token in enumerate(sent):
+             # "Tokenize" each word individually, so as to track the alignment between spaCy/HF tokens
+             # Prefix all tokens with a space except the first one in the sentence
+             if i == 0:
+                 token_text = token.text
+             else:
+                 token_text = ' ' + token.text
+             start_hf_idx = len(hf_tokens)
+             word_tokens = self.tokenizer.tokenize(token_text)
+             end_hf_idx = len(hf_tokens) + len(word_tokens)
+             if end_hf_idx < self.max_length:
+                 hf_tokens.extend(word_tokens)
+                 hf_idxs = list(range(start_hf_idx, end_hf_idx))
+             else:
+                 hf_idxs = None
+             token_alignment.append(hf_idxs)
+         return hf_tokens, token_alignment
+
+
+ def _split(data, sizes):
+     it = iter(data)
+     return [[next(it) for _ in range(size)] for size in sizes]
+
+
+ def _iter_len(it):
+     return sum(1 for _ in it)
+
+
+ # TODO set up batching
+ # To get top K axis and value per row: https://stackoverflow.com/questions/42832711/using-np-argpartition-to-index-values-in-a-multidimensional-array
+
+
+ def _ngrams(tokens, n):
+     for i in range(len(tokens) - n + 1):
+         yield tokens[i:i + n]
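+
+
+ if __name__ == "__main__":
+     # Minimal usage sketch (an illustration with made-up sentences, assuming the
+     # project's requirements and the en_core_web_sm spaCy model are installed).
+     import spacy
+
+     nlp = spacy.load("en_core_web_sm")
+     document = nlp("The quick brown fox jumped over the lazy dog near the river.")
+     summary = nlp("A quick fox jumped over a dog.")
+
+     # Each alignment maps (start, end) token spans in the summary to the list
+     # of matching (start, end) token spans in the document.
+     alignments = NGramAligner().align(document, [summary])
+     print(alignments[0])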
app.py ADDED
@@ -0,0 +1,375 @@
+ import argparse
+ import json
+ import operator
+ import os
+ import re
+ from pathlib import Path
+
+ import spacy
+ import streamlit as st
+ from robustnessgym import Dataset, Identifier
+ from robustnessgym import Spacy
+ from spacy.tokens import Doc
+
+ from align import NGramAligner, BertscoreAligner, StaticEmbeddingAligner
+ from components import MainView
+ from preprocessing import NGramAlignerCap, StaticEmbeddingAlignerCap, \
+     BertscoreAlignerCap
+ from preprocessing import _spacy_decode, _spacy_encode
+ from utils import preprocess_text
+
+ MIN_SEMANTIC_SIM_THRESHOLD = 0.1
+ MAX_SEMANTIC_SIM_TOP_K = 10
+
+ Doc.set_extension("name", default=None, force=True)
+ Doc.set_extension("column", default=None, force=True)
+
+
+ class Instance:
+     def __init__(self, id_, document, reference, preds, data=None):
+         self.id = id_
+         self.document = document
+         self.reference = reference
+         self.preds = preds
+         self.data = data
+
+
+ @st.cache(allow_output_mutation=True)
+ def load_from_index(filename, index):
+     with open(filename) as f:
+         for i, line in enumerate(f):
+             if i == index:
+                 return json.loads(line.strip())
+
+
+ @st.cache(allow_output_mutation=True)
+ def load_dataset(path: str):
+     if path.endswith('.jsonl'):
+         return Dataset.from_jsonl(path)
+     try:
+         return Dataset.load_from_disk(path)
+     except NotADirectoryError:
+         return Dataset.from_jsonl(path)
+
+
+ @st.cache(allow_output_mutation=True)
+ def get_nlp():
+     os.popen('python -m spacy download en_core_web_sm').read()
+     try:
+         nlp = spacy.load("en_core_web_lg")
+     except OSError:
+         nlp = spacy.load("en_core_web_sm")
+         is_lg = False
+     else:
+         is_lg = True
+     nlp.add_pipe('sentencizer', before="parser")
+     return nlp, is_lg
+
+
+ def retrieve(dataset, index, filename=None):
+     if index >= len(dataset):
+         st.error(f"Index {index} exceeds dataset length.")
+         return
+
+     eval_dataset = None
+     if filename:
+         # TODO Handle this through dedicated fields
+         if "cnn_dailymail" in filename:
+             eval_dataset = "cnndm"
+         elif "xsum" in filename:
+             eval_dataset = "xsum"
+
+     data = dataset[index]
+     id_ = data.get('id', '')
+
+     try:
+         document = rg_spacy.decode(
+             data[rg_spacy.identifier(columns=['preprocessed_document'])]
+         )
+     except KeyError:
+         if not is_lg:
+             st.error("The 'en_core_web_lg' model is required unless loading from a cached file. "
+                      "To install: 'python -m spacy download en_core_web_lg'")
+         try:
+             text = data['document']
+         except KeyError:
+             text = data['article']
+         if not text:
+             st.error("Document is blank")
+             return
+         document = nlp(preprocess_text(text))
+     document._.name = "Document"
+     document._.column = "document"
+
+     try:
+         reference = rg_spacy.decode(
+             data[rg_spacy.identifier(columns=['preprocessed_summary:reference'])]
+         )
+     except KeyError:
+         if not is_lg:
+             st.error("The 'en_core_web_lg' model is required unless loading from a cached file. "
+                      "To install: 'python -m spacy download en_core_web_lg'")
+         try:
+             text = data['summary'] if 'summary' in data else data['summary:reference']
+         except KeyError:
+             text = data.get('highlights')
+         if text:
+             reference = nlp(preprocess_text(text))
+         else:
+             reference = None
+     if reference is not None:
+         reference._.name = "Reference"
+         reference._.column = "summary:reference"
+
+     model_names = set()
+     for k in data:
+         m = re.match('(preprocessed_)?summary:(?P<model>.*)', k)
+         if m:
+             model_name = m.group('model')
+             if model_name != 'reference':
+                 model_names.add(model_name)
+
+     preds = []
+     for model_name in model_names:
+         try:
+             pred = rg_spacy.decode(
+                 data[rg_spacy.identifier(columns=[f"preprocessed_summary:{model_name}"])]
+             )
+         except KeyError:
+             if not is_lg:
+                 st.error("The 'en_core_web_lg' model is required unless loading from a cached file. "
+                          "To install: 'python -m spacy download en_core_web_lg'")
+             pred = nlp(preprocess_text(data[f"summary:{model_name}"]))
+
+         parts = model_name.split("-")
+         primary_sort = 0
+         if len(parts) == 2:
+             model, train_dataset = parts
+             if train_dataset == eval_dataset:
+                 formatted_model_name = model.upper()
+             else:
+                 formatted_model_name = f"{model.upper()} ({train_dataset.upper()}-trained)"
+                 if train_dataset in ["xsum", "cnndm"]:
+                     primary_sort = 1
+                 else:
+                     primary_sort = 2
+         else:
+             formatted_model_name = model_name.upper()
+         pred._.name = formatted_model_name
+         pred._.column = f"summary:{model_name}"
+         preds.append(
+             ((primary_sort, formatted_model_name), pred)
+         )
+
+     preds = [pred for _, pred in sorted(preds)]
+
+     return Instance(
+         id_=id_,
+         document=document,
+         reference=reference,
+         preds=preds,
+         data=data,
+     )
+
+
+ def filter_alignment(alignment, threshold, top_k):
+     filtered_alignment = {}
+     for k, v in alignment.items():
+         filtered_matches = [(match_idx, score) for match_idx, score in v if score >= threshold]
+         if filtered_matches:
+             filtered_alignment[k] = sorted(filtered_matches, key=operator.itemgetter(1), reverse=True)[:top_k]
+     return filtered_alignment
+
+
+ def select_comparison(example):
+     all_summaries = []
+
+     if example.reference:
+         all_summaries.append(example.reference)
+     if example.preds:
+         all_summaries.extend(example.preds)
+
+     from_documents = [example.document]
+     if example.reference:
+         from_documents.append(example.reference)
+     document_names = [document._.name for document in from_documents]
+     select_document_name = sidebar_placeholder_from.selectbox(
+         label="Comparison FROM:",
+         options=document_names
+     )
+     document_index = document_names.index(select_document_name)
+     selected_document = from_documents[document_index]
+
+     remaining_summaries = [summary for summary in all_summaries if
+                            summary._.name != selected_document._.name]
+     remaining_summary_names = [summary._.name for summary in remaining_summaries]
+
+     selected_summary_names = sidebar_placeholder_to.multiselect(
+         'Comparison TO:',
+         remaining_summary_names,
+         remaining_summary_names
+     )
+     selected_summaries = []
+     for summary_name in selected_summary_names:
+         summary_index = remaining_summary_names.index(summary_name)
+         selected_summaries.append(remaining_summaries[summary_index])
+     return selected_document, selected_summaries
+
+
+ def show_main(example):
+     # Get user input
+
+     semantic_sim_type = st.sidebar.radio(
+         "Semantic similarity type:",
+         ["Contextual embedding", "Static embedding"]
+     )
+     semantic_sim_threshold = st.sidebar.slider(
+         "Semantic similarity threshold:",
+         min_value=MIN_SEMANTIC_SIM_THRESHOLD,
+         max_value=1.0,
+         step=0.1,
+         value=0.2,
+     )
+     semantic_sim_top_k = st.sidebar.slider(
+         "Semantic similarity top-k:",
+         min_value=1,
+         max_value=MAX_SEMANTIC_SIM_TOP_K,
+         step=1,
+         value=10,
+     )
+
+     document, summaries = select_comparison(example)
+     layout = st.sidebar.radio("Layout:", ["Vertical", "Horizontal"]).lower()
+     # if layout == "horizontal":
+     #     scroll = st.sidebar.checkbox(label="Scroll sections", value=True)
+     # else:
+     scroll = True
+     gray_out_stopwords = st.sidebar.checkbox(label="Gray out stopwords", value=True)
+
+     # Gather data
+     try:
+         lexical_alignments = [
+             NGramAlignerCap.decode(
+                 example.data[
+                     Identifier(NGramAlignerCap.__name__)(
+                         columns=[
+                             f'preprocessed_{document._.column}',
+                             f'preprocessed_{summary._.column}',
+                         ]
+                     )
+                 ])[0]
+             for summary in summaries
+         ]
+         lexical_alignments = [
+             {k: [(pair[0], int(pair[1])) for pair in v]
+              for k, v in d.items()}
+             for d in lexical_alignments
+         ]
+     except KeyError:
+         lexical_alignments = NGramAligner().align(document, summaries)
+
+     if semantic_sim_type == "Static embedding":
+         try:
+             semantic_alignments = [
+                 StaticEmbeddingAlignerCap.decode(
+                     example.data[
+                         Identifier(StaticEmbeddingAlignerCap.__name__)(
+                             threshold=MIN_SEMANTIC_SIM_THRESHOLD,
+                             top_k=MAX_SEMANTIC_SIM_TOP_K,
+                             columns=[
+                                 f'preprocessed_{document._.column}',
+                                 f'preprocessed_{summary._.column}',
+                             ]
+                         )
+                     ])[0]
+                 for summary in summaries
+             ]
+         except KeyError:
+             semantic_alignments = StaticEmbeddingAligner(
+                 semantic_sim_threshold,
+                 semantic_sim_top_k).align(
+                 document,
+                 summaries
+             )
+         else:
+             semantic_alignments = [
+                 filter_alignment(alignment, semantic_sim_threshold, semantic_sim_top_k)
+                 for alignment in semantic_alignments
+             ]
+     else:
+         try:
+             semantic_alignments = [
+                 BertscoreAlignerCap.decode(
+                     example.data[
+                         Identifier(BertscoreAlignerCap.__name__)(
+                             threshold=MIN_SEMANTIC_SIM_THRESHOLD,
+                             top_k=MAX_SEMANTIC_SIM_TOP_K,
+                             columns=[
+                                 f'preprocessed_{document._.column}',
+                                 f'preprocessed_{summary._.column}',
+                             ]
+                         )
+                     ])[0]
+                 for summary in summaries
+             ]
+         except KeyError:
+             semantic_alignments = BertscoreAligner(semantic_sim_threshold,
+                                                    semantic_sim_top_k).align(document,
+                                                                              summaries)
+         else:
+             semantic_alignments = [
+                 filter_alignment(alignment, semantic_sim_threshold, semantic_sim_top_k)
+                 for alignment in semantic_alignments
+             ]
+
+     MainView(
+         document,
+         summaries,
+         semantic_alignments,
+         lexical_alignments,
+         layout,
+         scroll,
+         gray_out_stopwords,
+     ).show(height=720)
+
+
+ if __name__ == "__main__":
+
+     st.set_page_config(layout="wide")
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--path', type=str, default='data')
+     parser.add_argument('--file', type=str)
+     args = parser.parse_args()
+
+     nlp, is_lg = get_nlp()
+
+     Spacy.encode = _spacy_encode
+     Spacy.decode = _spacy_decode
+     rg_spacy = Spacy(nlp=nlp)
+
+     path = Path(args.path)
+     all_files = set(map(os.path.basename, path.glob('*')))
+     files = sorted([
+         fname for fname in all_files if not (fname.endswith(".py") or fname.startswith("."))
+     ])
+     if args.file:
+         try:
+             file_index = files.index(args.file)
+         except ValueError:
+             raise FileNotFoundError(f"File not found: {args.file}")
+     else:
+         file_index = 0
+     col1, col2 = st.beta_columns((3, 1))
+     filename = col1.selectbox(label="File:", options=files, index=file_index)
+     dataset = load_dataset(str(path / filename))
+
+     dataset_size = len(dataset)
+     query = col2.number_input(f"Index (Size: {dataset_size}):", value=0, min_value=0, max_value=dataset_size - 1)
+
+     sidebar_placeholder_from = st.sidebar.empty()
+     sidebar_placeholder_to = st.sidebar.empty()
+
+     if query is not None:
+         example = retrieve(dataset, query, filename)
+         if example:
+             show_main(example)
components.py ADDED
@@ -0,0 +1,563 @@
+ from collections import defaultdict
+ from itertools import count
+ from operator import itemgetter
+ from pathlib import Path
+ from typing import Dict, List, Optional, Tuple, Union
+
+ import htbuilder
+ from htbuilder import span, script, style, link, div, styles, HtmlElement
+ from htbuilder.units import px
+ from spacy.tokens import Doc
+
+ import streamlit as st
+
+ palette = [
+     "#66c2a5",
+     "#fc8d62",
+     "#8da0cb",
+     "#e78ac3",
+     "#a6d854",
+     "#ffd92f",
+     "#e5c494",
+     "#b3b3b3",
+ ]
+ inactive_color = "#BBB"
+
+
+ def local_stylesheet(path):
+     with open(path) as f:
+         css = f.read()
+     return style()(
+         css
+     )
+
+
+ def remote_stylesheet(url):
+     return link(
+         rel="stylesheet",
+         href=url
+     )
+
+
+ def local_script(path):
+     with open(path) as f:
+         code = f.read()
+     return script()(
+         code
+     )
+
+
+ def remote_script(url):
+     return script(
+         src=url
+     )
+
+
+ def get_color(sent_idx):
+     return palette[sent_idx % len(palette)]
+
+
+ def hex_to_rgb(hex_color):
+     hex_color = hex_color.replace("#", '')
+     return tuple(int(hex_color[i:i + 2], 16) for i in (0, 2, 4))
+
+
+ def color_with_opacity(hex_color, opacity):
+     rgb = hex_to_rgb(hex_color)
+     return f"rgba({rgb[0]},{rgb[1]},{rgb[2]},{opacity:.2f})"
+
+
+ class Component:
+
+     def show(self, width=None, height=None, scrolling=True, **kwargs):
+         out = div(style=styles(
+             **kwargs
+         ))(self.html())
+         html = str(out)
+         st.components.v1.html(html, width=width, height=height, scrolling=scrolling)
+
+     def html(self):
+         raise NotImplementedError
+
+
+ class MainView(Component):
+
+     def __init__(
+         self,
+         document: Doc,
+         summaries: List[Doc],
+         semantic_alignments: Optional[List[Dict]],
+         lexical_alignments: Optional[List[Dict]],
+         layout: str,
+         scroll: bool,
+         gray_out_stopwords: bool
+     ):
+         self.document = document
+         self.summaries = summaries
+         self.semantic_alignments = semantic_alignments
+         self.lexical_alignments = lexical_alignments
+         self.layout = layout
+         self.scroll = scroll
+         self.gray_out_stopwords = gray_out_stopwords
+
+     def html(self):
+
+         # Add the document header
+         if self.document._.name == 'Document':
+             document_name = 'Source Document'
+         else:
+             document_name = self.document._.name + ' summary'
+         doc_header = div(
+             id_="document-header"
+         )(
+             document_name
+         )
+         doc_elements = []
+
+         # Add the document content, which comprises multiple elements, one per
+         # summary. Only the element corresponding to the selected summary is visible.
+
+         mu = MultiUnderline()
+
+         for summary_idx, summary in enumerate(self.summaries):
+             token_idx_to_sent_idx = {}
+             for sent_idx, sent in enumerate(summary.sents):
+                 for token in sent:
+                     token_idx_to_sent_idx[token.i] = sent_idx
+             is_selected_summary = (summary_idx == 0)  # By default, the first summary is selected
+
+             if self.semantic_alignments is not None:
+                 doc_token_idx_to_matches = defaultdict(list)
+                 semantic_alignment = self.semantic_alignments[summary_idx]
+                 for summary_token_idx, matches in semantic_alignment.items():
+                     for doc_token_idx, sim in matches:
+                         doc_token_idx_to_matches[doc_token_idx].append((summary_token_idx, sim))
+             else:
+                 doc_token_idx_to_matches = {}
+
+             token_elements = []
+             for doc_token_idx, doc_token in enumerate(self.document):
+                 if doc_token.is_stop or doc_token.is_punct:
+                     classes = ["stopword"]
+                     if self.gray_out_stopwords:
+                         classes.append("grayed-out")
+                     el = span(
+                         _class=" ".join(classes)
+                     )(
+                         doc_token.text
+                     )
+                 else:
+                     matches = doc_token_idx_to_matches.get(doc_token_idx)
+                     if matches:
+                         summary_token_idx, sim = max(matches, key=itemgetter(1))
+                         sent_idx = token_idx_to_sent_idx[summary_token_idx]
+                         color_primary = get_color(sent_idx)
+                         highlight_color_primary = color_with_opacity(color_primary, sim)
+                         props = {
+                             'data-highlight-id': str(doc_token_idx),
+                             'data-primary-color': highlight_color_primary
+                         }
+                         match_classes = []
+                         for summary_token_idx, sim in matches:
+                             sent_idx = token_idx_to_sent_idx[summary_token_idx]
+                             match_classes.append(f"summary-highlight-{summary_idx}-{summary_token_idx}")
+                             color = color_with_opacity(get_color(sent_idx), sim)
+                             props[f"data-color-{summary_idx}-{summary_token_idx}"] = color
+                         props["data-match-classes"] = " ".join(match_classes)
+                         el = self._highlight(
+                             doc_token.text,
+                             highlight_color_primary,
+                             color_primary,
+                             match_classes + ["annotation-hidden"],
+                             **props
+                         )
+                     else:
+                         el = doc_token.text
+                 token_elements.append(el)
+
+             spans = []
+             if self.lexical_alignments is not None:
+                 lexical_alignment = self.lexical_alignments[summary_idx]
+                 for summary_span, doc_spans in lexical_alignment.items():
+                     summary_span_start, summary_span_end = summary_span
+                     span_id = f"{summary_idx}-{summary_span_start}-{summary_span_end}"
+                     sent_idx = token_idx_to_sent_idx[summary_span_start]
+                     for doc_span_start, doc_span_end in doc_spans:
+                         spans.append((
+                             doc_span_start,
+                             doc_span_end,
+                             sent_idx,
+                             get_color(sent_idx),
+                             span_id
+                         ))
+             token_elements = mu.markup(token_elements, spans)
+
+             classes = ["main-doc", "bordered"]
+             if self.scroll:
+                 classes.append("scroll")
+
+             main_doc = div(
+                 _class=" ".join(classes)
+             )(
+                 token_elements
+             )
+
+             classes = ["doc"]
+             if is_selected_summary:
+                 classes.append("display")
+             else:
+                 classes.append("nodisplay")
+             doc_elements.append(
+                 div(
+                     **{
+                         "class": " ".join(classes),
+                         "data-index": summary_idx
+                     }
+                 )(
+                     main_doc,
+                     div(_class="proxy-doc"),
+                     div(_class="proxy-scroll")
+                 )
+             )
+
+         summary_title = "Summary"
+         summary_header = div(
+             id_="summary-header"
+         )(
+             summary_title,
+             div(id_="summary-header-gap"),
+         )
+
+         summary_items = []
+         for summary_idx, summary in enumerate(self.summaries):
+             token_idx_to_sent_idx = {}
+             for sent_idx, sent in enumerate(summary.sents):
+                 for token in sent:
+                     token_idx_to_sent_idx[token.i] = sent_idx
+
+             spans = []
+             matches_ngram = [False] * len(list(summary))
+             if self.lexical_alignments is not None:
+                 lexical_alignment = self.lexical_alignments[summary_idx]
+                 for summary_span in lexical_alignment.keys():
+                     start, end = summary_span
+                     matches_ngram[start:end] = [True] * (end - start)
+                     span_id = f"{summary_idx}-{start}-{end}"
+                     sent_idx = token_idx_to_sent_idx[start]
+                     spans.append((
+                         start,
+                         end,
+                         sent_idx,
+                         get_color(sent_idx),
+                         span_id
+                     ))
+
+             if self.semantic_alignments is not None:
+                 semantic_alignment = self.semantic_alignments[summary_idx]
+             else:
+                 semantic_alignment = {}
+             token_elements = []
+             for token_idx, token in enumerate(summary):
+                 if token.is_stop or token.is_punct:
+                     classes = ["stopword"]
+                     if self.gray_out_stopwords:
+                         classes.append("grayed-out")
+                     el = span(
+                         _class=" ".join(classes)
+                     )(
+                         token.text
+                     )
+                 else:
+                     classes = []
+                     if token.ent_iob_ in ('I', 'B'):
+                         classes.append("entity")
+                     if matches_ngram[token_idx]:
+                         classes.append("matches-ngram")
+                     matches = semantic_alignment.get(token_idx)
+                     if matches:
+                         top_match = max(matches, key=itemgetter(1))
+                         top_sim = max(top_match[1], 0)
+                         top_doc_token_idx = top_match[0]
+                         props = {
+                             "data-highlight-id": f"{summary_idx}-{token_idx}",
+                             "data-top-doc-highlight-id": str(top_doc_token_idx),
+                             "data-top-doc-sim": f"{top_sim:.2f}",
+                         }
+                         classes.extend([
+                             "annotation-hidden",
+                             f"summary-highlight-{summary_idx}-{token_idx}"
+                         ])
+                         sent_idx = token_idx_to_sent_idx[token_idx]
+                         el = self._highlight(
+                             token.text,
+                             color_with_opacity(get_color(sent_idx), top_sim),
+                             color_with_opacity(get_color(sent_idx), 1),
+                             classes,
+                             **props
+                         )
+                     else:
+                         if classes:
+                             el = span(_class=" ".join(classes))(token.text)
+                         else:
+                             el = token.text
+                 token_elements.append(el)
+
+             token_elements = mu.markup(token_elements, spans)
+
+             classes = ["summary-item"]
+             if summary_idx == 0:  # By default, the first summary is selected
+                 classes.append("selected")
+
+             summary_items.append(
+                 div(
+                     **{"class": ' '.join(classes), "data-index": summary_idx}
+                 )(
+                     div(_class="name")(summary._.name),
+                     div(_class="content")(token_elements)
+                 )
+             )
+         classes = ["summary-list", "bordered"]
+         if self.scroll:
+             classes.append("scroll")
+         if self.lexical_alignments is not None:
+             classes.append("has-lexical-alignment")
+         if self.semantic_alignments is not None:
+             classes.append("has-semantic-alignment")
+         summary_list = div(
+             _class=" ".join(classes)
+         )(
+             summary_items
+         )
+
+         annotation_key = \
+             """
+             <ul class="annotation-key">
+                 <li class="annotation-key-label">Annotations:</li>
+                 <li id="option-lexical" class="option selected">
+                     <span class="annotation-key-ngram">N-Gram overlap</span>
+                 </li>
+                 <li id="option-semantic" class="option selected">
+                     <span class="annotation-key-semantic">Semantic overlap</span>
+                 </li>
+                 <li id="option-novel" class="option selected">
+                     <span class="annotation-key-novel">Novel words</span>
+                 </li>
+                 <li id="option-entity" class="option selected">
+                     <span class="annotation-key-entity">Novel entities</span>
+                 </li>
+             </ul>
+             """
+
+         body = div(
+             annotation_key,
+             div(
+                 _class=f"vis-container {self.layout}-layout"
+             )(
+                 div(
+                     _class="doc-container"
+                 )(
+                     doc_header,
+                     *doc_elements
+                 ),
+                 div(
+                     _class="summary-container"
+                 )(
+                     summary_header,
+                     summary_list
+                 )
+             ),
+         )
+         return [
+             """<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">""",
+             local_stylesheet(Path(__file__).parent / "resources" / "summvis.css"),
+             """<link rel="preconnect" href="https://fonts.gstatic.com">
+             <link href="https://fonts.googleapis.com/css2?family=Roboto:wght@400;500&display=swap" rel="stylesheet">""",
+             body,
+             """<script
+                 src="https://code.jquery.com/jquery-3.5.1.min.js"
+                 integrity="sha256-9/aliU8dGd2tb6OSsuzixeV4y/faTqgFtohetphbbj0="
+                 crossorigin="anonymous"></script>
+             <script src="https://cdn.jsdelivr.net/npm/bootstrap@4.6.0/dist/js/bootstrap.bundle.min.js"
+                 integrity="sha384-Piv4xVNRyMGpqkS2by6br4gNJ7DXjqk09RmUpJ8jgGtD7zP9yug3goQfGII0yAns"
+                 crossorigin="anonymous"></script>""",
+             local_script(Path(__file__).parent / "resources" / "jquery.color-2.1.2.min.js"),
+             local_script(Path(__file__).parent / "resources" / "summvis.js"),
+             """<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/js/bootstrap.bundle.min.js" integrity="sha384-gtEjrD/SeCtmISkJkNUaaKMoLD0//ElJ19smozuHV6z3Iehds+3Ulb9Bn9Plx0x4" crossorigin="anonymous"></script>"""
+         ]
+
+     def _highlight(
+         self,
+         token: Union[str, HtmlElement],
+         background_color,
+         dotted_underline_color,
+         classes: List[str],
+         **props
+     ):
+         return span(
+             _class=" ".join(classes + ["highlight"]),
+             style=styles(
+                 background_color=background_color,
+                 border_bottom=f"4px dotted {dotted_underline_color}",
+             ),
+             **props
+         )(token)
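+
+ # Note on `_highlight` (illustrative values only): a call such as
+ #   self._highlight("fox", "rgba(102,194,165,0.80)", "#66c2a5",
+ #                   ["summary-highlight-0-3", "annotation-hidden"])
+ # renders roughly as
+ #   <span class="summary-highlight-0-3 annotation-hidden highlight"
+ #         style="background-color:rgba(102,194,165,0.80);border-bottom:4px dotted #66c2a5">fox</span>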
+
+
+ SPACE = "&ensp;"
+
+
+ class MultiUnderline:
+     def __init__(
+         self,
+         underline_thickness=3,
+         underline_spacing=1
+     ):
+         self.underline_thickness = underline_thickness
+         self.underline_spacing = underline_spacing
+
+     def markup(
+         self,
+         tokens: List[Union[str, HtmlElement]],
+         spans: List[Tuple[int, int, int, str, str]]
+     ):
+         """Style text with multiple layers of colored underlines.
+
+         Args:
+             tokens: list of tokens, each either a string or an html element
+             spans: list of (start_pos, end_pos, rank, color, id) tuples, where:
+                 start_pos: start position of the underline span
+                 end_pos: end position of the underline span (exclusive)
+                 rank: rank for stacking order of underlines, all else being equal
+                 color: color of the underline
+                 id: id of the underline (encoded as a class label in the resulting html element)
+
+         Returns:
+             List of HTML elements
+         """
+
+         # Map from span start position to span
+         start_to_spans = defaultdict(list)
+         for span in spans:
+             start = span[0]
+             start_to_spans[start].append(span)
+
+         # Map from each underline slot position to the list of active spans
+         slot_to_spans = {}
+
+         # Collection of html elements
+         elements = []
+
+         for pos, token in enumerate(tokens):
+             # Remove spans that are no longer active (i.e. end <= pos)
+             slot_to_spans = defaultdict(
+                 list,
+                 {
+                     slot: [span for span in spans if span[1] > pos]  # span[1] is the span's end
+                     for slot, spans in slot_to_spans.items() if spans
+                 }
+             )
+
+             # Add underlines to the space between tokens for any continuing underlines
+             if pos > 0:
+                 elements.append(self._get_underline_element(SPACE, slot_to_spans))
+
+             # Find a slot for any new spans
+             new_spans = start_to_spans.pop(pos, None)
+             if new_spans:
+                 new_spans.sort(
+                     key=lambda span: (-(span[1] - span[0]), span[2]))  # Sort by span length (reversed), then rank
+                 for new_span in new_spans:
+                     # Find an existing slot or add a new one
+                     for slot, spans in sorted(slot_to_spans.items(), key=itemgetter(0)):  # Sort by slot index
+                         if spans:
+                             # The first span in the slot strictly contains all other spans
+                             containing_span = spans[0]
+                             containing_start, containing_end = containing_span[0:2]
+                             containing_color = containing_span[3]
+                             start, end = new_span[0:2]
+                             color = new_span[3]
+                             # If the new span (1) is strictly contained in this span, or (2) exactly
+                             # matches this span and has the same color, add it to this slot
+                             if end <= containing_end and (
+                                 (start > containing_start or end < containing_end) or
+                                 (start == containing_start and end == containing_end and color == containing_color)
+                             ):
+                                 spans.append(new_span)
+                                 break
+                     else:
+                         # Find a new slot index at which to add the span
+                         for slot_index in count():
+                             spans = slot_to_spans[slot_index]
+                             if not spans:  # If the slot is free, take it
+                                 spans.append(new_span)
+                                 break
+
+             # Add underlines to the token for all active spans
+             elements.append(self._get_underline_element(token, slot_to_spans))
+         return elements
+
+     def _get_underline_element(self, token, slot_to_spans):
+         if not slot_to_spans:
+             return token
+         max_slot_index = max(slot_to_spans.keys())
+         element = token
+         for slot_index in range(max_slot_index + 1):
+             spans = slot_to_spans[slot_index]
+             if not spans:
+                 color = "rgba(0, 0, 0, 0)"  # Transparent underline (opacity 0)
+                 props = {}
+             else:
+                 containing_span = spans[0]
+                 color = containing_span[3]
+                 classes = ["underline"]
+                 if token != SPACE:
+                     classes.append("token-underline")
+                 classes.extend([f"span-{span[4]}" for span in spans])  # Encode ids in class names
+                 props = {
+                     "class": " ".join(classes),
+                     "data-primary-color": color
+                 }
+             if slot_index == 0:
+                 padding_bottom = 0
+             else:
+                 padding_bottom = self.underline_spacing
+             display = "inline-block"
+             element = htbuilder.span(
+                 style=styles(
+                     display=display,
+                     border_bottom=f"{self.underline_thickness}px solid",
+                     border_color=color,
+                     padding_bottom=px(padding_bottom),
+                 ),
+                 **props
+             )(element)
+
+         # Return the outermost nested span
+         return element
+
+
+ if __name__ == "__main__":
+     # Simple smoke test: mark up a short token sequence with overlapping and
+     # nested spans, then print the resulting HTML.
+     tokens = [
+         "The",
+         htbuilder.span(style=styles(color="red"))("quick"),
+         "brown",
+         "fox",
+         "jumps"
+     ]
+     spans = [
+         (0, 2, 0, "green", "green1"),
+         (1, 3, 0, "orange", "orange1"),
+         (3, 4, 0, "red", "red1"),
+         (2, 4, 0, "blue", "blue1"),
+         (1, 5, 0, "orange", "orange1"),
+     ]
+
+     mu = MultiUnderline()
+     html = str(div(mu.markup(tokens, spans)))
+     print(html)
data/10:cnn_dailymail_1000.validation/_dataset/data.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a67d16f8fade54f8e2d525ea80a5dd44af5b7070811e76ca0b1d281931505f8e
+ size 679738
data/10:cnn_dailymail_1000.validation/metadata.json ADDED
@@ -0,0 +1 @@
+ {"interactions": "{\"cachedoperations\": \"{\\\"[\\\\\\\"{\\\\\\\\\\\\\\\"_name\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"Spacy\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"_index\\\\\\\\\\\\\\\": null, \\\\\\\\\\\\\\\"_parameters\\\\\\\\\\\\\\\": {\\\\\\\\\\\\\\\"lang\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"en\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"pipeline\\\\\\\\\\\\\\\": [\\\\\\\\\\\\\\\"tok2vec\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"tagger\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"sentencizer\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"parser\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"ner\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"attribute_ruler\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"lemmatizer\\\\\\\\\\\\\\\"]}}\\\\\\\", \\\\\\\"preprocessed_summary:bart-cnndm\\\\\\\"]\\\": 2, \\\"[\\\\\\\"{\\\\\\\\\\\\\\\"_name\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"Spacy\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"_index\\\\\\\\\\\\\\\": null, \\\\\\\\\\\\\\\"_parameters\\\\\\\\\\\\\\\": {\\\\\\\\\\\\\\\"lang\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"en\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"pipeline\\\\\\\\\\\\\\\": [\\\\\\\\\\\\\\\"tok2vec\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"tagger\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"sentencizer\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"parser\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"ner\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"attribute_ruler\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"lemmatizer\\\\\\\\\\\\\\\"]}}\\\\\\\", \\\\\\\"preprocessed_summary:bart-xsum\\\\\\\"]\\\": 3, \\\"[\\\\\\\"{\\\\\\\\\\\\\\\"_name\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"Spacy\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"_index\\\\\\\\\\\\\\\": null, \\\\\\\\\\\\\\\"_parameters\\\\\\\\\\\\\\\": {\\\\\\\\\\\\\\\"lang\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"en\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"pipeline\\\\\\\\\\\\\\\": [\\\\\\\\\\\\\\\"tok2vec\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"tagger\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"sentencizer\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"parser\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"ner\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"attribute_ruler\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"lemmatizer\\\\\\\\\\\\\\\"]}}\\\\\\\", \\\\\\\"preprocessed_summary:pegasus-cnndm\\\\\\\"]\\\": 4, \\\"[\\\\\\\"{\\\\\\\\\\\\\\\"_name\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"Spacy\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"_index\\\\\\\\\\\\\\\": null, \\\\\\\\\\\\\\\"_parameters\\\\\\\\\\\\\\\": {\\\\\\\\\\\\\\\"lang\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"en\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"pipeline\\\\\\\\\\\\\\\": [\\\\\\\\\\\\\\\"tok2vec\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"tagger\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"sentencizer\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"parser\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"ner\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"attribute_ruler\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"lemmatizer\\\\\\\\\\\\\\\"]}}\\\\\\\", \\\\\\\"preprocessed_summary:pegasus-multinews\\\\\\\"]\\\": 5, \\\"[\\\\\\\"{\\\\\\\\\\\\\\\"_name\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"Spacy\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"_index\\\\\\\\\\\\\\\": null, \\\\\\\\\\\\\\\"_parameters\\\\\\\\\\\\\\\": {\\\\\\\\\\\\\\\"lang\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"en\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"pipeline\\\\\\\\\\\\\\\": [\\\\\\\\\\\\\\\"tok2vec\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"tagger\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"sentencizer\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"parser\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"ner\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"attribute_ruler\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"lemmatizer\\\\\\\\\\\\\\\"]}}\\\\\\\", \\\\\\\"preprocessed_summary:pegasus-newsroom\\\\\\\"]\\\": 6, \\\"[\\\\\\\"{\\\\\\\\\\\\\\\"_name\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"Spacy\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"_index\\\\\\\\\\\\\\\": null, 
\\\\\\\\\\\\\\\"_parameters\\\\\\\\\\\\\\\": {\\\\\\\\\\\\\\\"lang\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"en\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"pipeline\\\\\\\\\\\\\\\": [\\\\\\\\\\\\\\\"tok2vec\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"tagger\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"sentencizer\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"parser\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"ner\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"attribute_ruler\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"lemmatizer\\\\\\\\\\\\\\\"]}}\\\\\\\", \\\\\\\"preprocessed_summary:pegasus-xsum\\\\\\\"]\\\": 7, \\\"[\\\\\\\"{\\\\\\\\\\\\\\\"_name\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"Spacy\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"_index\\\\\\\\\\\\\\\": null, \\\\\\\\\\\\\\\"_parameters\\\\\\\\\\\\\\\": {\\\\\\\\\\\\\\\"lang\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"en\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"pipeline\\\\\\\\\\\\\\\": [\\\\\\\\\\\\\\\"tok2vec\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"tagger\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"sentencizer\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"parser\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"ner\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"attribute_ruler\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"lemmatizer\\\\\\\\\\\\\\\"]}}\\\\\\\", \\\\\\\"preprocessed_document\\\\\\\"]\\\": 6, \\\"[\\\\\\\"{\\\\\\\\\\\\\\\"_name\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"Spacy\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"_index\\\\\\\\\\\\\\\": null, \\\\\\\\\\\\\\\"_parameters\\\\\\\\\\\\\\\": {\\\\\\\\\\\\\\\"lang\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"en\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"pipeline\\\\\\\\\\\\\\\": [\\\\\\\\\\\\\\\"tok2vec\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"tagger\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"sentencizer\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"parser\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"ner\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"attribute_ruler\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"lemmatizer\\\\\\\\\\\\\\\"]}}\\\\\\\", \\\\\\\"preprocessed_summary:reference\\\\\\\"]\\\": 7}\", \"slicebuilders\": {\"subpopulation\": \"{}\", \"transformation\": \"{}\", \"attack\": \"{}\"}}", "_identifier": "{\"_name\": \"RGDataset\", \"_index\": null, \"_parameters\": {\"jsonl\": \"preloading/cnn_dailymail_1000.validation.predictions.jsonl\"}}", "_dataset_fmt": "in_memory"}
generation.py ADDED
@@ -0,0 +1,142 @@
+ #!/usr/bin/env python
+ # coding: utf-8
+ """
+ Script for decoding summarization models available through Huggingface Transformers.
+
+ Usage with Huggingface Datasets:
+     python generation.py --model <model name> --dataset <dataset name> --split <data split>
+
+ Usage with custom datasets in JSONL format:
+     python generation.py --model <model name> --data_path <path to data in jsonl format>
+ """
+
+ import argparse
+ import json
+ import os
+
+ import torch
+
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+ from datasets import load_dataset
+ from tqdm import tqdm
+
+ BATCH_SIZE = 8
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ BART_CNNDM_CHECKPOINT = 'facebook/bart-large-cnn'
+ BART_XSUM_CHECKPOINT = 'facebook/bart-large-xsum'
+ PEGASUS_CNNDM_CHECKPOINT = 'google/pegasus-cnn_dailymail'
+ PEGASUS_XSUM_CHECKPOINT = 'google/pegasus-xsum'
+ PEGASUS_NEWSROOM_CHECKPOINT = 'google/pegasus-newsroom'
+ PEGASUS_MULTINEWS_CHECKPOINT = 'google/pegasus-multi_news'
+
+ MODEL_CHECKPOINTS = {
+     'bart-xsum': BART_XSUM_CHECKPOINT,
+     'bart-cnndm': BART_CNNDM_CHECKPOINT,
+     'pegasus-xsum': PEGASUS_XSUM_CHECKPOINT,
+     'pegasus-cnndm': PEGASUS_CNNDM_CHECKPOINT,
+     'pegasus-newsroom': PEGASUS_NEWSROOM_CHECKPOINT,
+     'pegasus-multinews': PEGASUS_MULTINEWS_CHECKPOINT
+ }
+
+
+ class JSONDataset(torch.utils.data.Dataset):
+     def __init__(self, data_path):
+         super(JSONDataset, self).__init__()
+
+         with open(data_path) as fd:
+             self.data = [json.loads(line) for line in fd]
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, idx):
+         return self.data[idx]
+
+
+ def preprocess_data(raw_data, dataset):
+     """
+     Unify the format of Huggingface Datasets batches.
+
+     :param raw_data: loaded data batch
+     :param dataset: name of the dataset
+     """
+     if dataset == 'xsum':
+         raw_data['article'] = raw_data['document']
+         raw_data['target'] = raw_data['summary']
+         del raw_data['document']
+         del raw_data['summary']
+     elif dataset == 'cnndm':
+         raw_data['target'] = raw_data['highlights']
+         del raw_data['highlights']
+     elif dataset == 'gigaword':
+         raw_data['article'] = raw_data['document']
+         raw_data['target'] = raw_data['summary']
+         del raw_data['document']
+         del raw_data['summary']
+     # Custom JSONL datasets are passed through unchanged and are expected to
+     # already provide 'article' and 'target' fields.
+     return raw_data
+
+
+ def postprocess_data(raw_data, decoded):
+     """
+     Remove generation artifacts and postprocess outputs.
+
+     :param raw_data: loaded data batch
+     :param decoded: model outputs
+     """
+     raw_data['target'] = [x.replace('\n', ' ') for x in raw_data['target']]
+     raw_data['decoded'] = [x.replace('<n>', ' ') for x in decoded]
+     # Convert the batch from a dict of lists into a list of per-example dicts
+     return [dict(zip(raw_data, t)) for t in zip(*raw_data.values())]
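+
+ # Illustrative shapes: PyTorch's default collate turns a batch into a dict of
+ # lists, e.g. raw_data = {'article': ['a1', 'a2'], 'target': ['t1\nt1b', 't2']};
+ # with decoded = ['s1<n>s1b', 's2'], postprocess_data returns
+ # [{'article': 'a1', 'target': 't1 t1b', 'decoded': 's1 s1b'},
+ #  {'article': 'a2', 'target': 't2', 'decoded': 's2'}].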
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser(description='Generate summaries with pretrained summarization models.')
+     parser.add_argument('--model', type=str, required=True,
+                         choices=['bart-xsum', 'bart-cnndm', 'pegasus-xsum', 'pegasus-cnndm',
+                                  'pegasus-newsroom', 'pegasus-multinews'])
+     parser.add_argument('--data_path', type=str)
+     parser.add_argument('--dataset', type=str, choices=['xsum', 'cnndm', 'gigaword'])
+     parser.add_argument('--split', type=str, choices=['train', 'validation', 'test'])
+     args = parser.parse_args()
+
+     if args.dataset and not args.split:
+         raise RuntimeError('If the `dataset` flag is specified, `split` must also be provided.')
+
+     if args.data_path:
+         args.dataset = os.path.splitext(os.path.basename(args.data_path))[0]
+         args.split = 'user'
+
+     # Load model & data
+     model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINTS[args.model]).to(DEVICE)
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINTS[args.model])
+
+     if not args.data_path:
+         if args.dataset == 'cnndm':
+             dataset = load_dataset('cnn_dailymail', '3.0.0', split=args.split)
+         elif args.dataset == 'xsum':
+             dataset = load_dataset('xsum', split=args.split)
+         elif args.dataset == 'gigaword':
+             dataset = load_dataset('gigaword', split=args.split)
+     else:
+         dataset = JSONDataset(args.data_path)
+
+     dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE)
+
+     # Run generation and stream results to disk
+     filename = '%s.%s.%s.results' % (args.model.replace("/", "-"), args.dataset, args.split)
+     fd_out = open(filename, 'w')
+
+     results = []
+     model.eval()
+     with torch.no_grad():
+         for raw_data in tqdm(dataloader):
+             raw_data = preprocess_data(raw_data, args.dataset)
+             batch = tokenizer(raw_data["article"], return_tensors="pt", truncation=True, padding="longest").to(DEVICE)
+             summaries = model.generate(input_ids=batch.input_ids, attention_mask=batch.attention_mask)
+
+             decoded = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+             result = postprocess_data(raw_data, decoded)
+             results.extend(result)
+
+             for example in result:
+                 fd_out.write(json.dumps(example) + '\n')
+
+     fd_out.close()
preprocessing.py ADDED
@@ -0,0 +1,701 @@
+ import logging
+ import os
+ from argparse import ArgumentParser
+ from ast import literal_eval
+ from types import SimpleNamespace
+ from typing import List
+
+ from robustnessgym import Dataset, Spacy, CachedOperation
+ from robustnessgym.core.constants import CACHEDOPS
+ from robustnessgym.core.tools import strings_as_json
+ from robustnessgym.logging.utils import set_logging_level
+ from spacy import load
+ from spacy.attrs import DEP, IS_ALPHA, IS_PUNCT, IS_STOP, LEMMA, LOWER, TAG, SENT_END, \
+     SENT_START, ORTH, POS, ENT_IOB
+ from spacy.tokens import Doc
+
+ from align import BertscoreAligner, NGramAligner, StaticEmbeddingAligner
+ from utils import preprocess_text
+
+ set_logging_level('critical')
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.CRITICAL)
+
+
+ def _spacy_encode(self, x):
+     arr = x.to_array(
+         [DEP, IS_ALPHA, IS_PUNCT, IS_STOP, LEMMA, LOWER, TAG, SENT_END, SENT_START,
+          ORTH, POS, ENT_IOB])
+     return {
+         'arr': arr.flatten(),
+         'shape': list(arr.shape),
+         'words': [t.text for t in x]
+     }
+
+
+ def _spacy_decode(self, x):
+     doc = Doc(self.nlp.vocab, words=x['words'])
+     return doc.from_array(
+         [DEP, IS_ALPHA, IS_PUNCT, IS_STOP, LEMMA, LOWER,
+          TAG, SENT_END, SENT_START, ORTH, POS, ENT_IOB],
+         x['arr'].reshape(x['shape'])
+     )
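+
+ # The two helpers above are monkey-patched onto Robustness Gym's `Spacy`
+ # operation below, so that cached spaCy Docs round-trip through plain dicts of
+ # arrays. A minimal sketch of the intended round trip (hypothetical `op` instance):
+ #   encoded = _spacy_encode(op, nlp("The quick brown fox."))
+ #   doc = _spacy_decode(op, encoded)  # tokens and encoded attributes restored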
+
+
+ Spacy.encode = _spacy_encode
+ Spacy.decode = _spacy_decode
+
+
+ class AlignerCap(CachedOperation):
+     def __init__(
+         self,
+         aligner,
+         spacy,
+         **kwargs,
+     ):
+         super(AlignerCap, self).__init__(**kwargs)
+         self.spacy = spacy
+         self.aligner = aligner
+
+     @classmethod
+     def encode(cls, x):
+         # Convert to built-in types from np.int / np.float
+         return super(AlignerCap, cls).encode([
+             {str(k): [(int(t[0]), float(t[1])) for t in v] for k, v in d.items()}
+             for d in x
+         ])
+
+     @classmethod
+     def decode(cls, x):
+         x = super(AlignerCap, cls).decode(x)
+         x = [{literal_eval(k): v for k, v in d.items()} for d in x]
+         return x
+
+     def apply(self, batch, columns, *args, **kwargs):
+         # Run the aligner on the first example of the batch
+         return [
+             self.aligner.align(
+                 self.spacy.retrieve(batch, columns[0])[0],
+                 [self.spacy.retrieve(batch, col)[0] for col in columns[1:]]
+                 if len(columns) > 2 else
+                 [self.spacy.retrieve(batch, columns[1])[0]],
+             )
+         ]
+
+
+ class BertscoreAlignerCap(AlignerCap):
+     def __init__(
+         self,
+         threshold: float,
+         top_k: int,
+         spacy,
+     ):
+         super(BertscoreAlignerCap, self).__init__(
+             aligner=BertscoreAligner(threshold=threshold, top_k=top_k),
+             spacy=spacy,
+             threshold=threshold,
+             top_k=top_k,
+         )
+
+
+ class NGramAlignerCap(AlignerCap):
+     def __init__(
+         self,
+         spacy,
+     ):
+         super(NGramAlignerCap, self).__init__(
+             aligner=NGramAligner(),
+             spacy=spacy
+         )
+
+
+ class StaticEmbeddingAlignerCap(AlignerCap):
+     def __init__(
+         self,
+         threshold: float,
+         top_k: int,
+         spacy,
+     ):
+         super(StaticEmbeddingAlignerCap, self).__init__(
+             aligner=StaticEmbeddingAligner(threshold=threshold, top_k=top_k),
+             spacy=spacy,
+             threshold=threshold,
+             top_k=top_k,
+         )
+
+
+ def _run_aligners(
+     dataset: Dataset,
+     aligners: List[CachedOperation],
+     doc_column: str,
+     reference_column: str,
+     summary_columns: List[str] = None,
+ ):
+     if not summary_columns:
+         summary_columns = []
+
+     to_columns = []
+     if reference_column is not None:
+         to_columns.append(reference_column)
+     to_columns.extend(summary_columns)
+
+     for aligner in aligners:
+
+         # Run the aligner on (document, summary) pairs
+         dataset = aligner(
+             dataset,
+             [doc_column] + to_columns,
+             # Must use `batch_size = 1`
+             batch_size=1,
+         )
+
+         if reference_column is not None and len(summary_columns):
+             # Run the aligner on (reference, summary) pairs
+             dataset = aligner(
+                 dataset,
+                 [reference_column] + summary_columns,
+                 # Must use `batch_size = 1`
+                 batch_size=1,
+             )
+
+         if len(to_columns) > 1:
+             # Instead of having one column for all (document, summary) comparisons,
+             # split off into (1 + |summary_columns|) total columns, one per comparison
+
+             # Retrieve the (document, summary) column
+             doc_summary_column = aligner.retrieve(
+                 dataset[:],
+                 [doc_column] + to_columns,
+             )[tuple([doc_column] + to_columns)]
+
+             for i, col in enumerate(to_columns):
+                 # Add as a new column after encoding with the aligner's `encode` method
+                 dataset.add_column(
+                     column=str(aligner.identifier(columns=[doc_column, col])),
+                     values=[aligner.encode([row[i]]) for row in doc_summary_column],
+                 )
+
+             # Remove the (document, summary) column
+             dataset.remove_column(
+                 str(
+                     aligner.identifier(
+                         columns=[doc_column] + to_columns
+                     )
+                 )
+             )
+             del dataset.interactions[CACHEDOPS].history[
+                 (
+                     aligner.identifier,
+                     strings_as_json(
+                         strings=[doc_column] + to_columns
+                     )
+                 )
+             ]
+
+         if reference_column is not None and len(summary_columns) > 1:
+             # Instead of having one column for all (reference, summary) comparisons,
+             # split off into (|summary_columns|) total columns, one per comparison
+
+             # Retrieve the (reference, summary) column
+             reference_summary_column = aligner.retrieve(
+                 dataset[:],
+                 [reference_column] + summary_columns,
+             )[tuple([reference_column] + summary_columns)]
+
+             for i, col in enumerate(summary_columns):
+                 # Add as a new column
+                 dataset.add_column(
+                     column=str(aligner.identifier(columns=[reference_column, col])),
+                     values=[
+                         aligner.encode([row[i]]) for row in reference_summary_column
+                     ]
+                 )
+
+             # Remove the (reference, summary) column
+             dataset.remove_column(
+                 str(
+                     aligner.identifier(
+                         columns=[reference_column] + summary_columns
+                     )
+                 )
+             )
+             del dataset.interactions[CACHEDOPS].history[
+                 (
+                     aligner.identifier,
+                     strings_as_json(
+                         strings=[reference_column] + summary_columns
+                     )
+                 )
+             ]
+
+     return dataset
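+
+ # Illustrative effect of `_run_aligners` on column layout: with
+ # doc_column="preprocessed_document", reference_column="preprocessed_summary:reference"
+ # and one summary column "preprocessed_summary:bart-cnndm", each aligner ends up
+ # contributing one encoded column per (document, reference), (document, summary)
+ # and (reference, summary) pair, instead of a single column holding every comparison.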
+
+
+ def deanonymize_dataset(
+     rg_path: str,
+     standardized_dataset: Dataset,
+     processed_dataset_path: str = None,
+     n_samples: int = None,
+ ):
+     """Take an anonymized dataset and add back the original dataset columns."""
+     assert processed_dataset_path is not None, \
+         "Please specify a path to save the dataset."
+
+     # Load the dataset
+     dataset = Dataset.load_from_disk(rg_path)
+
+     if n_samples:
+         dataset.set_visible_rows(list(range(n_samples)))
+         standardized_dataset.set_visible_rows(list(range(n_samples)))
+
+     text_columns = []
+
+     # Add columns from the standardized dataset
+     dataset.add_column('document', standardized_dataset['document'])
+     text_columns.append('document')
+
+     if 'summary:reference' in standardized_dataset.column_names:
+         dataset.add_column('summary:reference', standardized_dataset['summary:reference'])
+         text_columns.append('summary:reference')
+
+     # Preprocess all the text columns
+     dataset = dataset.update(
+         lambda x: {f'preprocessed_{k}': preprocess_text(x[k]) for k in text_columns}
+     )
+
+     # Run the Spacy pipeline on all preprocessed text columns
+     try:
+         nlp = load('en_core_web_lg')
+     except OSError:
+         nlp = load('en_core_web_sm')
+
+     nlp.add_pipe('sentencizer', before="parser")
+     spacy = Spacy(nlp=nlp)
+     dataset = spacy(
+         dataset,
+         [f'preprocessed_{col}' for col in text_columns],
+         batch_size=100,
+     )
+
+     # Directly save to disk
+     dataset.save_to_disk(processed_dataset_path)
+
+     return dataset
+
+
+ def run_workflow(
+     jsonl_path: str = None,
+     dataset: Dataset = None,
+     doc_column: str = None,
+     reference_column: str = None,
+     summary_columns: List[str] = None,
+     bert_aligner_threshold: float = 0.5,
+     bert_aligner_top_k: int = 3,
+     embedding_aligner_threshold: float = 0.5,
+     embedding_aligner_top_k: int = 3,
+     processed_dataset_path: str = None,
+     n_samples: int = None,
+     anonymize: bool = False,
+ ):
+     assert (jsonl_path is None) != (dataset is None), \
+         "Exactly one of `jsonl_path` and `dataset` must be specified."
+     assert processed_dataset_path is not None, \
+         "Please specify a path to save the dataset."
+
+     # Load the dataset
+     if jsonl_path is not None:
+         dataset = Dataset.from_jsonl(jsonl_path)
+
+     if doc_column is None:
+         # Assume `doc_column` is called "document"
+         doc_column = 'document'
+         assert doc_column in dataset.column_names, \
+             f"`doc_column={doc_column}` is not a column in dataset."
+         print("Assuming `doc_column` is called 'document'.")
+
+     if reference_column is None:
+         # Assume `reference_column` is called "summary:reference"
+         reference_column = 'summary:reference'
+         print("Assuming `reference_column` is called 'summary:reference'.")
+         if reference_column not in dataset.column_names:
+             print("No reference summary loaded")
+             reference_column = None
+
+     if summary_columns is None or len(summary_columns) == 0:
+         # Assume `summary_columns` are prefixed by "summary:"
+         summary_columns = []
+         for col in dataset.column_names:
+             if col.startswith("summary:") and col != "summary:reference":
+                 summary_columns.append(col)
+         print(f"Reading summary columns from dataset. Found {summary_columns}.")
+
+     if len(summary_columns) == 0 and reference_column is None:
+         raise ValueError("At least one summary is required")
+
+     # Restrict visible rows to the first `n_samples`
+     if n_samples:
+         dataset.set_visible_rows(list(range(n_samples)))
+
+     # Combine the text columns into one list
+     text_columns = [doc_column] + ([reference_column] if reference_column else []) + summary_columns
+
+     # Preprocess all the text columns
+     dataset = dataset.update(
+         lambda x: {f'preprocessed_{k}': preprocess_text(x[k]) for k in text_columns}
+     )
+
+     # Run the Spacy pipeline on all preprocessed text columns
+     nlp = load('en_core_web_lg')
+     nlp.add_pipe('sentencizer', before="parser")
+     spacy = Spacy(nlp=nlp)
+     dataset = spacy(
+         dataset,
+         [f'preprocessed_{col}' for col in text_columns],
+         batch_size=100,
+     )
+
+     # Run the three alignment pipelines
+     bert_aligner = BertscoreAlignerCap(
+         threshold=bert_aligner_threshold,
+         top_k=bert_aligner_top_k,
+         spacy=spacy,
+     )
+
+     embedding_aligner = StaticEmbeddingAlignerCap(
+         threshold=embedding_aligner_threshold,
+         top_k=embedding_aligner_top_k,
+         spacy=spacy,
+     )
+
+     ngram_aligner = NGramAlignerCap(
+         spacy=spacy,
+     )
+
+     dataset = _run_aligners(
+         dataset=dataset,
+         aligners=[bert_aligner, embedding_aligner, ngram_aligner],
+         doc_column=f'preprocessed_{doc_column}',
+         reference_column=f'preprocessed_{reference_column}' if reference_column else None,
+         summary_columns=[f'preprocessed_{col}' for col in summary_columns],
+     )
+
+     # Save the dataset
+     if anonymize:
+         # Remove certain columns to anonymize, then save to disk
+         for col in [doc_column, reference_column]:
+             if col is not None:
+                 dataset.remove_column(col)
+                 dataset.remove_column(f'preprocessed_{col}')
+                 dataset.remove_column(
+                     str(spacy.identifier(columns=[f'preprocessed_{col}']))
+                 )
+                 del dataset.interactions[CACHEDOPS].history[
+                     (spacy.identifier, f'preprocessed_{col}')
+                 ]
+         dataset.save_to_disk(f'{processed_dataset_path}.anonymized')
+     else:
+         # Directly save to disk
+         dataset.save_to_disk(processed_dataset_path)
+
+     return dataset
+
+
+ def parse_prediction_jsonl_name(prediction_jsonl: str):
+     """Parse the name of the prediction_jsonl to extract useful information."""
+     # Analyze the name of the prediction_jsonl
+     filename = prediction_jsonl.split("/")[-1]
+
+     # Check that the filename ends with `.results.anonymized`
+     if filename.endswith(".results.anonymized"):
+         # Fmt: <model>-<training dataset>.<eval dataset>.<eval split>.results.anonymized
+
+         # Split on periods
+         model_train_dataset, eval_dataset, eval_split = filename.split(".")[:-2]
+         model, train_dataset = model_train_dataset.split("-")
+
+         return SimpleNamespace(
+             model_train_dataset=model_train_dataset,
+             model=model,
+             train_dataset=train_dataset,
+             eval_dataset=eval_dataset,
+             eval_split=eval_split,
+         )
+
+     raise NotImplementedError(
+         "Prediction files must be named "
+         "<model>-<training dataset>.<eval dataset>.<eval split>.results.anonymized. "
+         f"Please rename the prediction file {filename} and run again."
+     )
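+
+ # Worked example (illustrative): a prediction file named
+ # "bart-cnndm.cnn_dailymail.test.results.anonymized" parses to
+ # model="bart", train_dataset="cnndm", eval_dataset="cnn_dailymail",
+ # eval_split="test" (and model_train_dataset="bart-cnndm").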
+
+
+ def join_predictions(
+     dataset_jsonl: str = None,
+     prediction_jsonls: str = None,
+     save_jsonl_path: str = None,
+ ):
+     """Join predictions with a dataset."""
+     assert prediction_jsonls is not None, "Must have prediction jsonl files."
+
+     print(
+         "> Warning: please inspect the prediction .jsonl file to make sure that "
+         "predictions are aligned with the examples in the dataset. "
+         "Use `get_dataset` to inspect the dataset."
+     )
+
+     # Load the dataset
+     dataset = get_dataset(dataset_jsonl=dataset_jsonl)
+
+     # Parse the names of all prediction files to get metadata
+     metadata = [
+         parse_prediction_jsonl_name(prediction_jsonl)
+         for prediction_jsonl in prediction_jsonls
+     ]
+
+     # Load the predictions
+     predictions = [
+         Dataset.from_jsonl(json_path=prediction_jsonl)
+         for prediction_jsonl in prediction_jsonls
+     ]
+
+     # Add the predictions of each model
+     for i, prediction_data in enumerate(predictions):
+         # Get metadata for the i-th prediction file
+         metadata_i = metadata[i]
+
+         # Construct a prefix for columns added to the dataset for this prediction file
+         prefix = metadata_i.model_train_dataset
+
+         # Add the prediction columns to the dataset
+         for col in prediction_data.column_names:
+             # Don't add the indexing information since the dataset has it already
+             if col not in {'index', 'ix', 'id'}:
+                 # `add_column` will automatically ensure that column lengths match
+                 if col == 'decoded':  # Rename 'decoded' to a summary column
+                     dataset.add_column(f'summary:{prefix}', prediction_data[col])
+                 else:
+                     dataset.add_column(f'{prefix}:{col}', prediction_data[col])
+
+     # Save the dataset back to disk
+     if save_jsonl_path:
+         dataset.to_jsonl(save_jsonl_path)
+     else:
+         print("Dataset with predictions was not saved since `save_jsonl_path` "
+               "was not specified.")
+
+     return dataset
+
+
+ def standardize_dataset(
+     dataset_name: str = None,
+     dataset_version: str = None,
+     dataset_split: str = 'test',
+     dataset_jsonl: str = None,
+     doc_column: str = None,
+     reference_column: str = None,
+     save_jsonl_path: str = None,
+     no_save: bool = False,
+ ):
+     """Load a dataset from Huggingface and dump it to disk."""
+     # Load the dataset from Huggingface
+     dataset = get_dataset(
+         dataset_name=dataset_name,
+         dataset_version=dataset_version,
+         dataset_split=dataset_split,
+         dataset_jsonl=dataset_jsonl,
+     )
+
+     if doc_column is None:
+         if reference_column is not None:
+             raise ValueError("You must specify `doc_column` if you specify `reference_column`")
+         try:
+             doc_column, reference_column = {
+                 'cnn_dailymail': ('article', 'highlights'),
+                 'xsum': ('document', 'summary')
+             }[dataset_name]
+         except KeyError:
+             raise NotImplementedError(
+                 "Please specify `doc_column`."
+             )
+
+     # Rename the columns
+     if doc_column != 'document':
+         dataset.add_column('document', dataset[doc_column])
+         dataset.remove_column(doc_column)
+     if reference_column != 'summary:reference':
+         dataset.add_column('summary:reference', dataset[reference_column])
+         dataset.remove_column(reference_column)
+
+     # Save the dataset back to disk
+     if save_jsonl_path:
+         dataset.to_jsonl(save_jsonl_path)
+
+     elif (save_jsonl_path is None) and not no_save:
+         # Auto-create a path to save the standardized dataset
+         os.makedirs('preprocessing', exist_ok=True)
+         if not dataset_jsonl:
+             dataset.to_jsonl(
+                 f'preprocessing/'
+                 f'standardized_{dataset_name}_{dataset_version}_{dataset_split}.jsonl'
+             )
+         else:
+             dataset.to_jsonl(
+                 f'preprocessing/'
+                 f'standardized_{dataset_jsonl.split("/")[-1]}'
+             )
+
+     return dataset
+
+
+ def get_dataset(
+     dataset_name: str = None,
+     dataset_version: str = None,
+     dataset_split: str = 'test',
+     dataset_jsonl: str = None,
+ ):
+     """Load a dataset."""
+     assert (dataset_name is not None) != (dataset_jsonl is not None), \
+         "Specify exactly one of `dataset_name` or `dataset_jsonl`."
+
+     # Load the dataset
+     if dataset_name is not None:
+         return get_hf_dataset(dataset_name, dataset_version, dataset_split)
+
+     return Dataset.from_jsonl(json_path=dataset_jsonl)
+
+
+ def get_hf_dataset(name: str, version: str = None, split: str = 'test'):
+     """Get a dataset from Huggingface."""
+     if version:
+         return Dataset.load_dataset(name, version, split=split)
+     return Dataset.load_dataset(name, split=split)
+
+
+ if __name__ == '__main__':
+     parser = ArgumentParser()
+     parser.add_argument('--dataset', type=str, choices=['cnn_dailymail', 'xsum'],
+                         help="Huggingface dataset name.")
+     parser.add_argument('--version', type=str,
+                         help="Huggingface dataset version.")
+     parser.add_argument('--split', type=str, default='test',
+                         help="Huggingface dataset split.")
+     parser.add_argument('--dataset_jsonl', type=str,
+                         help="Path to a jsonl file for the dataset.")
+     parser.add_argument('--dataset_rg', type=str,
+                         help="Path to a dataset stored in the Robustness Gym format. "
+                              "All processed datasets are stored in this format.")
+     parser.add_argument('--prediction_jsonls', nargs='+', default=[],
+                         help="Path to one or more jsonl files for the predictions.")
+     parser.add_argument('--save_jsonl_path', type=str,
+                         help="Path to save the processed jsonl dataset.")
+
+     parser.add_argument('--doc_column', type=str,
+                         help="Name of the document column in the dataset.")
+     parser.add_argument('--reference_column', type=str,
+                         help="Name of the reference summary column in the dataset.")
+     parser.add_argument('--summary_columns', nargs='+', default=[],
+                         help="Names of other summary columns in/added to the dataset.")
+
+     parser.add_argument('--bert_aligner_threshold', type=float, default=0.1,
+                         help="Minimum threshold for BERT alignment.")
+     parser.add_argument('--bert_aligner_top_k', type=int, default=10,
+                         help="Top-k for BERT alignment.")
+     parser.add_argument('--embedding_aligner_threshold', type=float, default=0.1,
+                         help="Minimum threshold for embedding alignment.")
+     parser.add_argument('--embedding_aligner_top_k', type=int, default=10,
+                         help="Top-k for embedding alignment.")
+     parser.add_argument('--processed_dataset_path', type=str,
+                         help="Path to store the final processed dataset.")
+     parser.add_argument('--n_samples', type=int,
+                         help="Number of dataset samples to process.")
+
+     parser.add_argument('--workflow', action='store_true', default=False,
+                         help="Whether to run the preprocessing workflow.")
+     parser.add_argument('--standardize', action='store_true', default=False,
+                         help="Whether to standardize the dataset and save to jsonl.")
+     parser.add_argument('--join_predictions', action='store_true', default=False,
+                         help="Whether to add predictions to the dataset and save to jsonl.")
+     parser.add_argument('--try_it', action='store_true', default=False,
+                         help="`Try it` mode is faster and runs processing on 10 examples.")
+     parser.add_argument('--deanonymize', action='store_true', default=False,
+                         help="Deanonymize the dataset provided by summvis.")
+     parser.add_argument('--anonymize', action='store_true', default=False,
+                         help="Anonymize by removing the document and reference summary "
+                              "columns of the original dataset.")
+
+     args = parser.parse_args()
+
+     if args.standardize:
+         # Dump a dataset to jsonl on disk after standardizing it
+         standardize_dataset(
+             dataset_name=args.dataset,
+             dataset_version=args.version,
+             dataset_split=args.split,
+             dataset_jsonl=args.dataset_jsonl,
+             doc_column=args.doc_column,
+             reference_column=args.reference_column,
+             save_jsonl_path=args.save_jsonl_path,
+         )
+
+     if args.join_predictions:
+         # Join the predictions with the dataset
+         dataset = join_predictions(
+             dataset_jsonl=args.dataset_jsonl,
+             prediction_jsonls=args.prediction_jsonls,
+             save_jsonl_path=args.save_jsonl_path,
+         )
+
+     if args.workflow:
+         # Run the processing workflow
+         dataset = None
+         # Check if `args.dataset_rg` was passed in
+         if args.dataset_rg:
+             # Load the dataset directly
+             dataset = Dataset.load_from_disk(args.dataset_rg)
+
+         run_workflow(
+             jsonl_path=args.dataset_jsonl,
+             dataset=dataset,
+             doc_column=args.doc_column,
+             reference_column=args.reference_column,
+             summary_columns=args.summary_columns,
+             bert_aligner_threshold=args.bert_aligner_threshold,
+             bert_aligner_top_k=args.bert_aligner_top_k,
+             embedding_aligner_threshold=args.embedding_aligner_threshold,
+             embedding_aligner_top_k=args.embedding_aligner_top_k,
+             processed_dataset_path=args.processed_dataset_path,
+             n_samples=args.n_samples if not args.try_it else 10,
+             anonymize=args.anonymize,
+         )
+
+     if args.deanonymize:
+         # Deanonymize an anonymized dataset
+         # Check that `args.dataset_rg` was passed in
+         assert args.dataset_rg is not None, \
+             "Must specify `dataset_rg` path to be deanonymized."
+         assert args.dataset_rg.endswith('anonymized'), \
+             "`dataset_rg` must end in 'anonymized'."
+         assert (args.dataset is None) != (args.dataset_jsonl is None), \
+             "`dataset_rg` points to an anonymized dataset that will be " \
+             "deanonymized. Please pass in the relevant arguments: either " \
+             "`dataset`, `version` and `split` OR `dataset_jsonl`."
+
+         # Load the standardized dataset
+         standardized_dataset = standardize_dataset(
+             dataset_name=args.dataset,
+             dataset_version=args.version,
+             dataset_split=args.split,
+             dataset_jsonl=args.dataset_jsonl,
+             doc_column=args.doc_column,
+             reference_column=args.reference_column,
+             no_save=True,
+         )
+         # Use it to deanonymize
+         dataset = deanonymize_dataset(
+             rg_path=args.dataset_rg,
+             standardized_dataset=standardized_dataset,
+             processed_dataset_path=args.processed_dataset_path,
+             n_samples=args.n_samples if not args.try_it else 10,
+         )
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ spacy==3.0.3
+ streamlit==0.77.0
+ st-annotated-text==1.1.0
+ transformers==4.2.2
+ torch==1.7.1
+ bert-score==0.3.7
+ rouge-score==0.0.4
+ toolz==0.11.1
+ nltk==3.4.5
+ robustnessgym==0.0.4a0
+ sentencepiece==0.1.95
resources/jquery.color-2.1.2.min.js ADDED
@@ -0,0 +1,2 @@
+ /*! jQuery Color v@2.1.2 http://github.com/jquery/jquery-color | jquery.org/license */
+ (function(a,b){function m(a,b,c){var d=h[b.type]||{};return a==null?c||!b.def?null:b.def:(a=d.floor?~~a:parseFloat(a),isNaN(a)?b.def:d.mod?(a+d.mod)%d.mod:0>a?0:d.max<a?d.max:a)}function n(b){var c=f(),d=c._rgba=[];return b=b.toLowerCase(),l(e,function(a,e){var f,h=e.re.exec(b),i=h&&e.parse(h),j=e.space||"rgba";if(i)return f=c[j](i),c[g[j].cache]=f[g[j].cache],d=c._rgba=f._rgba,!1}),d.length?(d.join()==="0,0,0,0"&&a.extend(d,k.transparent),c):k[b]}function o(a,b,c){return c=(c+1)%1,c*6<1?a+(b-a)*c*6:c*2<1?b:c*3<2?a+(b-a)*(2/3-c)*6:a}var c="backgroundColor borderBottomColor borderLeftColor borderRightColor borderTopColor color columnRuleColor outlineColor textDecorationColor textEmphasisColor",d=/^([\-+])=\s*(\d+\.?\d*)/,e=[{re:/rgba?\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*(?:,\s*(\d?(?:\.\d+)?)\s*)?\)/,parse:function(a){return[a[1],a[2],a[3],a[4]]}},{re:/rgba?\(\s*(\d+(?:\.\d+)?)\%\s*,\s*(\d+(?:\.\d+)?)\%\s*,\s*(\d+(?:\.\d+)?)\%\s*(?:,\s*(\d?(?:\.\d+)?)\s*)?\)/,parse:function(a){return[a[1]*2.55,a[2]*2.55,a[3]*2.55,a[4]]}},{re:/#([a-f0-9]{2})([a-f0-9]{2})([a-f0-9]{2})/,parse:function(a){return[parseInt(a[1],16),parseInt(a[2],16),parseInt(a[3],16)]}},{re:/#([a-f0-9])([a-f0-9])([a-f0-9])/,parse:function(a){return[parseInt(a[1]+a[1],16),parseInt(a[2]+a[2],16),parseInt(a[3]+a[3],16)]}},{re:/hsla?\(\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\%\s*,\s*(\d+(?:\.\d+)?)\%\s*(?:,\s*(\d?(?:\.\d+)?)\s*)?\)/,space:"hsla",parse:function(a){return[a[1],a[2]/100,a[3]/100,a[4]]}}],f=a.Color=function(b,c,d,e){return new a.Color.fn.parse(b,c,d,e)},g={rgba:{props:{red:{idx:0,type:"byte"},green:{idx:1,type:"byte"},blue:{idx:2,type:"byte"}}},hsla:{props:{hue:{idx:0,type:"degrees"},saturation:{idx:1,type:"percent"},lightness:{idx:2,type:"percent"}}}},h={"byte":{floor:!0,max:255},percent:{max:1},degrees:{mod:360,floor:!0}},i=f.support={},j=a("<p>")[0],k,l=a.each;j.style.cssText="background-color:rgba(1,1,1,.5)",i.rgba=j.style.backgroundColor.indexOf("rgba")>-1,l(g,function(a,b){b.cache="_"+a,b.props.alpha={idx:3,type:"percent",def:1}}),f.fn=a.extend(f.prototype,{parse:function(c,d,e,h){if(c===b)return this._rgba=[null,null,null,null],this;if(c.jquery||c.nodeType)c=a(c).css(d),d=b;var i=this,j=a.type(c),o=this._rgba=[];d!==b&&(c=[c,d,e,h],j="array");if(j==="string")return this.parse(n(c)||k._default);if(j==="array")return l(g.rgba.props,function(a,b){o[b.idx]=m(c[b.idx],b)}),this;if(j==="object")return c instanceof f?l(g,function(a,b){c[b.cache]&&(i[b.cache]=c[b.cache].slice())}):l(g,function(b,d){var e=d.cache;l(d.props,function(a,b){if(!i[e]&&d.to){if(a==="alpha"||c[a]==null)return;i[e]=d.to(i._rgba)}i[e][b.idx]=m(c[a],b,!0)}),i[e]&&a.inArray(null,i[e].slice(0,3))<0&&(i[e][3]=1,d.from&&(i._rgba=d.from(i[e])))}),this},is:function(a){var b=f(a),c=!0,d=this;return l(g,function(a,e){var f,g=b[e.cache];return g&&(f=d[e.cache]||e.to&&e.to(d._rgba)||[],l(e.props,function(a,b){if(g[b.idx]!=null)return c=g[b.idx]===f[b.idx],c})),c}),c},_space:function(){var a=[],b=this;return l(g,function(c,d){b[d.cache]&&a.push(c)}),a.pop()},transition:function(a,b){var c=f(a),d=c._space(),e=g[d],i=this.alpha()===0?f("transparent"):this,j=i[e.cache]||e.to(i._rgba),k=j.slice();return c=c[e.cache],l(e.props,function(a,d){var e=d.idx,f=j[e],g=c[e],i=h[d.type]||{};if(g===null)return;f===null?k[e]=g:(i.mod&&(g-f>i.mod/2?f+=i.mod:f-g>i.mod/2&&(f-=i.mod)),k[e]=m((g-f)*b+f,d))}),this[d](k)},blend:function(b){if(this._rgba[3]===1)return this;var c=this._rgba.slice(),d=c.pop(),e=f(b)._rgba;return 
f(a.map(c,function(a,b){return(1-d)*e[b]+d*a}))},toRgbaString:function(){var b="rgba(",c=a.map(this._rgba,function(a,b){return a==null?b>2?1:0:a});return c[3]===1&&(c.pop(),b="rgb("),b+c.join()+")"},toHslaString:function(){var b="hsla(",c=a.map(this.hsla(),function(a,b){return a==null&&(a=b>2?1:0),b&&b<3&&(a=Math.round(a*100)+"%"),a});return c[3]===1&&(c.pop(),b="hsl("),b+c.join()+")"},toHexString:function(b){var c=this._rgba.slice(),d=c.pop();return b&&c.push(~~(d*255)),"#"+a.map(c,function(a){return a=(a||0).toString(16),a.length===1?"0"+a:a}).join("")},toString:function(){return this._rgba[3]===0?"transparent":this.toRgbaString()}}),f.fn.parse.prototype=f.fn,g.hsla.to=function(a){if(a[0]==null||a[1]==null||a[2]==null)return[null,null,null,a[3]];var b=a[0]/255,c=a[1]/255,d=a[2]/255,e=a[3],f=Math.max(b,c,d),g=Math.min(b,c,d),h=f-g,i=f+g,j=i*.5,k,l;return g===f?k=0:b===f?k=60*(c-d)/h+360:c===f?k=60*(d-b)/h+120:k=60*(b-c)/h+240,h===0?l=0:j<=.5?l=h/i:l=h/(2-i),[Math.round(k)%360,l,j,e==null?1:e]},g.hsla.from=function(a){if(a[0]==null||a[1]==null||a[2]==null)return[null,null,null,a[3]];var b=a[0]/360,c=a[1],d=a[2],e=a[3],f=d<=.5?d*(1+c):d+c-d*c,g=2*d-f;return[Math.round(o(g,f,b+1/3)*255),Math.round(o(g,f,b)*255),Math.round(o(g,f,b-1/3)*255),e]},l(g,function(c,e){var g=e.props,h=e.cache,i=e.to,j=e.from;f.fn[c]=function(c){i&&!this[h]&&(this[h]=i(this._rgba));if(c===b)return this[h].slice();var d,e=a.type(c),k=e==="array"||e==="object"?c:arguments,n=this[h].slice();return l(g,function(a,b){var c=k[e==="object"?a:b.idx];c==null&&(c=n[b.idx]),n[b.idx]=m(c,b)}),j?(d=f(j(n)),d[h]=n,d):f(n)},l(g,function(b,e){if(f.fn[b])return;f.fn[b]=function(f){var g=a.type(f),h=b==="alpha"?this._hsla?"hsla":"rgba":c,i=this[h](),j=i[e.idx],k;return g==="undefined"?j:(g==="function"&&(f=f.call(this,j),g=a.type(f)),f==null&&e.empty?this:(g==="string"&&(k=d.exec(f),k&&(f=j+parseFloat(k[2])*(k[1]==="+"?1:-1))),i[e.idx]=f,this[h](i)))}})}),f.hook=function(b){var c=b.split(" ");l(c,function(b,c){a.cssHooks[c]={set:function(b,d){var e,g,h="";if(d!=="transparent"&&(a.type(d)!=="string"||(e=n(d)))){d=f(e||d);if(!i.rgba&&d._rgba[3]!==1){g=c==="backgroundColor"?b.parentNode:b;while((h===""||h==="transparent")&&g&&g.style)try{h=a.css(g,"backgroundColor"),g=g.parentNode}catch(j){}d=d.blend(h&&h!=="transparent"?h:"_default")}d=d.toRgbaString()}try{b.style[c]=d}catch(j){}}},a.fx.step[c]=function(b){b.colorInit||(b.start=f(b.elem,c),b.end=f(b.end),b.colorInit=!0),a.cssHooks[c].set(b.elem,b.start.transition(b.end,b.pos))}})},f.hook(c),a.cssHooks.borderColor={expand:function(a){var b={};return l(["Top","Right","Bottom","Left"],function(c,d){b["border"+d+"Color"]=a}),b}},k=a.Color.names={aqua:"#00ffff",black:"#000000",blue:"#0000ff",fuchsia:"#ff00ff",gray:"#808080",green:"#008000",lime:"#00ff00",maroon:"#800000",navy:"#000080",olive:"#808000",purple:"#800080",red:"#ff0000",silver:"#c0c0c0",teal:"#008080",white:"#ffffff",yellow:"#ffff00",transparent:[null,null,null,0],_default:"#ffffff"}})(jQuery);
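
For reference, this vendored plugin teaches jQuery to parse and interpolate colors: it registers cssHooks and $.fx.step handlers for the color-valued properties listed in the minified source above. A minimal usage sketch, illustrative only and not part of the commit ("#el" is a hypothetical element):

    // Interpolate halfway from red to blue; byte channels are floored, so this yields "rgb(127,0,127)"
    $.Color("#ff0000").transition($.Color("#0000ff"), 0.5).toRgbaString();
    // Animate a color-valued CSS property, which core jQuery cannot tween numerically on its own
    $("#el").animate({ backgroundColor: "#66c2a5" }, 300);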
resources/summvis.css ADDED
@@ -0,0 +1,347 @@
+ body {
+     font-family: 'Roboto', sans-serif;
+     font-weight: 400;
+     line-height: 1.5;
+     color: #262730;
+ }
+
+ .vis-container {
+     height: 670px;
+     background-color: #F5F7F9;
+ }
+
+ .nodisplay {
+     display: none !important;
+ }
+
+ .scroll {
+     overflow-y: scroll;
+ }
+
+ .doc-container {
+     padding: 10px 20px;
+ }
+
+ .horizontal-layout .doc-container {
+     padding-bottom: 0px;
+ }
+
+ .vertical-layout .doc-container {
+     float: left;
+     width: 50%;
+     padding-right: 0px;
+ }
+
+ .summary-container {
+     padding: 0px 20px;
+ }
+
+ .vertical-layout .summary-container {
+     float: left;
+     width: 50%;
+     padding-top: 8px;
+ }
+
+ .vertical-layout .main-doc.scroll {
+     height: 610px;
+ }
+
+ .main-doc.scroll {
+     scrollbar-width: none;
+ }
+
+ /* Works on Chrome, Edge, and Safari */
+ .main-doc.scroll::-webkit-scrollbar {
+     width: 0;
+ }
+
+ .vertical-layout .proxy-doc {
+     height: 610px;
+ }
+
+ .vertical-layout .summary-list.scroll {
+     height: 610px;
+ }
+
+ .horizontal-layout .scroll {
+     height: 270px;
+ }
+
+ .doc {
+     display: flex;
+ }
+
+ .horizontal-layout .doc {
+ }
+
+ .main-doc {
+     background-color: white;
+     padding-left: 17px;
+     padding-right: 15px;
+     padding-top: 16px;
+     border-top-left-radius: 4px;
+     border-bottom-left-radius: 4px;
+     flex: 1;
+     border: 1px solid #e9e9e9;
+ }
+
+ .display .proxy-scroll {
+     position: absolute;
+     left: 9px;
+     width: 9px;
+     border-radius: 6px;
+     background-color: rgba(0, 0, 0, 0.1);
+ }
+
+ .display .proxy-scroll.hover {
+     background-color: rgba(0, 0, 0, 0.2);
+ }
+
+ .proxy-doc {
+     flex: 0 0 28px;
+     background-color: white;
+     position: relative;
+     border-bottom-right-radius: 4px;
+     border-top-right-radius: 4px;
+     padding-left: 3px;
+     padding-right: 3px;
+     border-top: 1px solid #e9e9e9;
+     border-right: 1px solid #e9e9e9;
+     border-bottom: 1px solid #e9e9e9;
+ }
+
+ .vertical-layout .proxy-doc {
+     margin-right: 25px;
+ }
+
+ .summary-list {
+     border-top: 1px solid #ccc;
+     border-bottom: 1px solid #ccc;
+     border-radius: 4px;
+ }
+
+ .summary-item {
+     border-bottom: 1px solid #ccc;
+     border-left: 1px solid #ccc;
+     border-right: 1px solid #ccc;
+     background-color: white;
+     padding-top: 16px;
+     padding-bottom: 16px;
+     padding-left: 23px;
+     padding-right: 8px;
+ }
+
+ .summary-item:last-child {
+     border-bottom: 0px;
+     border-bottom-left-radius: 3px;
+ }
+
+ .summary-item.selected.selectable {
+     border-left: 3px solid #2377E9;
+     padding-left: 21px;
+ }
+
+ .summary-item.selectable:not(.selected):hover {
+     cursor: pointer;
+     background-color: #FCFDFF;
+ }
+
+ .summary-item.selected.selectable .highlight:not(.annotation-hidden):hover {
+     cursor: pointer;
+ }
+
+ .summary-item.selected.selectable .underline:not(.annotation-hidden):hover {
+     cursor: pointer;
+ }
+
+ .summary-item .name {
+     margin-bottom: 8px;
+     font-weight: 400;
+ }
+
+ .summary-item.selected.selectable .name {
+     font-weight: 500;
+ }
+
+ .inactive {
+     opacity: 0.5 !important;
+ }
+
+ .stopword.grayed-out {
+     opacity: 50%;
+ }
+
+ .has-lexical-alignment .annotate-novel {
+     /* Bold all non-underlined items */
+     font-weight: 500;
+     color: black;
+ }
+
+ .summary-item .stopword {
+     font-weight: 400;
+ }
+
+ .summary-item .token-underline {
+     font-weight: 400;
+ }
+
+ .summary-item:not(.selected) .underline, .summary-item:not(.selectable) .underline {
+     border-color: #909090 !important;
+ }
+
+ .underline.annotation-inactive {
+     border-color: #E9E9E9 !important;
+ }
+
+ .underline.annotation-invisible {
+     border-color: transparent !important;
+ }
+
+ .underline.annotation-hidden {
+     border: 0px !important;
+     margin: 0px !important;
+ }
+
+ .proxy-underline.annotation-hidden, .proxy-highlight.annotation-hidden {
+     visibility: hidden;
+ }
+
+ .proxy-underline.annotation-inactive {
+     background-color: #E9E9E9 !important;
+ }
+
+ .proxy-underline.annotation-invisible {
+     background-color: transparent !important;
+ }
+
+ .highlight {
+     display: inline-block;
+ }
+
+ .highlight.annotation-hidden {
+     background: none !important;
+     border-color: transparent !important;
+     border-bottom: 0px !important;
+ }
+
+ .highlight.annotation-invisible {
+     background-color: transparent !important;
+     border-color: transparent !important;
+ }
+
+ .summary-item:not(.selected) .highlight:not(.annotation-hidden),
+ .summary-item:not(.selectable) .highlight:not(.annotation-hidden) {
+     border-color: #909090 !important;
+ }
+
+ .highlight.annotation-inactive {
+     border-color: #E9E9E9 !important;
+ }
+
+ .display .proxy-scroll.hidden {
+     visibility: hidden;
+ }
+
+ #document-header {
+     min-height: 35px;
+     margin-bottom: 0px;
+     align-items: center;
+     color: black;
+     display: flex;
+ }
+
+ #summary-header {
+     display: flex;
+     justify-content: space-between;
+     align-items: center;
+     min-height: 35px;
+     margin-bottom: 0px;
+     color: black;
+ }
+
+ .horizontal-layout #summary-header {
+     margin-top: 23px;
+ }
+
+ #summary-header-gap {
+     flex: 1 0 15px;
+ }
+
+ .highlight.selected {
+     border-color: transparent !important;
+ }
+
+ .highlight:not(.selected), .proxy-highlight:not(.selected) {
+     background-color: transparent !important;
+ }
+
+ .summary-item.annotate-entities .entity:not(.matches-ngram) {
+     color: #fb425c;
+     font-weight: 500;
+ }
+
+ .summary-item.annotate-lexical .highlight.matches-ngram {
+     padding: 0px;
+     border-bottom: 0px !important;
+ }
+
+ .doc .highlight {
+     padding: 0px;
+     border: 0px !important;
+ }
+
+ ul.annotation-key {
+     display: flex;
+     align-items: flex-end;
+     list-style: none;
+     justify-content: flex-start;
+     padding: 0px;
+     margin: 0px 0px 10px 0px;
+ }
+
+ .annotation-key li {
+     margin-right: 15px;
+     font-size: 13px;
+     padding: 6px 13px 6px 13px;
+ }
+
+ .annotation-key li.option {
+     border-radius: 13px;
+     cursor: pointer;
+     border: 1px solid #F3F3F3;
+ }
+
+ .annotation-key li.option.selected {
+     background-color: #F0F2F6;
+ }
+
+ .annotation-key-label {
+     margin: 0px;
+     padding-left: 0px !important;
+     padding-right: 0px !important;
+ }
+
+ .annotation-key-ngram {
+     border-bottom: 3px solid #66c2a5;
+     padding-bottom: 1px;
+ }
+
+ .annotation-key-semantic {
+     border-bottom: 4px dotted #66c2a5;
+     padding-bottom: 1px;
+ }
+
+ .annotation-key-novel {
+     font-weight: 500;
+     color: black;
+ }
+
+ .annotation-key-entity {
+     font-weight: 500;
+     color: #fb425c;
+ }
+
+ .annotation-key-stopword {
+     opacity: 70%;
+ }
resources/summvis.js ADDED
@@ -0,0 +1,518 @@
+ $(document).ready(
+     function () {
+
+         // Define global variables
+
+         let isDragging = false;
+         let saveDragPos;
+
+         let rtime;
+         let timeout = false;
+         let delta = 200;
+
+         let disableScrollEvent = false;
+
+         let annotateLexical = false;
+         let annotateSemantic = false;
+         let annotateNovel = false;
+         let annotateEntities = false;
+
+         // Define functions
+
+         function clamp(number, min, max) {
+             return Math.max(min, Math.min(number, max));
+         }
+
+         function hasScroll() {
+             const el = $(".display .main-doc");
+             return el.prop("scrollHeight") > el.prop("clientHeight");
+         }
+
+         function scrollBy(delta) {
+             const proxyDoc = $(".display .proxy-doc");
+             const proxyScroll = proxyDoc.find(".proxy-scroll");
+             const currentTop = parseFloat(proxyScroll.css("top"));
+             const newTop = clamp(currentTop + delta, 0, proxyDoc.innerHeight() - proxyScroll.innerHeight());
+             proxyScroll.css("top", newTop);
+             const mainDoc = $(".display .main-doc");
+             const scaleY = mainDoc[0].scrollHeight / proxyDoc.innerHeight();
+             mainDoc.scrollTop(newTop * scaleY)
+         }
+
+         function getSpanId(el) {
+             return getSpanIds(el)[0]
+         }
+
+         function getSpanIds(el) {
+             return el.attr("class").split(/\s+/).filter(function (x) {
+                 return x.startsWith("span-")
+             });
+         }
+
+         function createProxy() {
+             const mainDoc = $(".display .main-doc");
+             const proxyDoc = $(".display .proxy-doc");
+             const proxyHeight = proxyDoc.innerHeight();
+             const proxyWidth = proxyDoc.innerWidth();
+             const scaleX = 0.8 * proxyWidth / mainDoc.innerWidth();
+             const scaleY = proxyHeight / mainDoc[0].scrollHeight;
+             const scrollTop = mainDoc.scrollTop();
+             const proxyScrollTop = scrollTop * scaleY;
+             const proxyScrollBottom = (scrollTop + mainDoc.innerHeight()) * scaleY;
+             const proxyScrollHeight = proxyScrollBottom - proxyScrollTop;
+             proxyDoc.empty();
+
+             // Loop through underlines in doc view and create associated proxy element
+             if (annotateLexical) {
+                 $(".display .main-doc .token-underline").each(
+                     function (index, value) {
+                         const el = $(value);
+                         const x = el.position().left;
+                         const y = mainDoc.scrollTop() + el.position().top - mainDoc.position().top;
+                         const newHeight = 3;
+                         const color = el.css("border-bottom-color");
+                         const proxyPadding = proxyDoc.innerWidth() - proxyDoc.width();
+                         const newX = x * scaleX + proxyPadding / 2;
+                         const newY = (y + el.height()) * scaleY - newHeight;
+                         const newWidth = Math.min(
+                             Math.max((el.width() * scaleX) + 1, 5),
+                             proxyDoc.width() + proxyPadding / 2 - newX
+                         );
+
+                         let classes = "proxy-underline annotation-hidden " + getSpanIds(el).join(" ");
+                         const proxyEl = $('<div/>', {
+                             "class": classes,
+                             "css": {
+                                 "position": "absolute",
+                                 "left": Math.round(newX),
+                                 "top": Math.round(newY),
+                                 "background-color": color,
+                                 "width": newWidth,
+                                 "height": newHeight,
+                             }
+                         }).appendTo(proxyDoc);
+                         proxyEl.data(el.data());
+                     }
+                 );
+             }
+
+             // Loop through all active highlights in doc view and create associated proxy element
+             if (annotateSemantic) {
+                 $(".display .main-doc .highlight").each(
+                     function (index, value) {
+                         const el = $(value);
+                         const x = el.position().left;
+                         const y = mainDoc.scrollTop() + el.position().top - mainDoc.position().top;
+                         const newHeight = 5;
+                         const color = el.css("background-color");
+                         const proxyPadding = proxyDoc.innerWidth() - proxyDoc.width()
+                         const newX = x * scaleX + proxyPadding / 2;
+                         const newY = (y + el.height()) * scaleY - newHeight;
+                         const newWidth = Math.min(
+                             Math.max((el.width() * scaleX) + 1, 5),
+                             proxyDoc.width() + proxyPadding / 2 - newX
+                         );
+                         const proxyEl = $('<div/>', {
+                             "class": 'proxy-highlight annotation-hidden',
+                             "css": {
+                                 "position": "absolute",
+                                 "left": Math.round(newX),
+                                 "top": Math.round(newY),
+                                 "background-color": color,
+                                 "width": newWidth,
+                                 "height": newHeight,
+                             }
+                         }).appendTo(proxyDoc);
+                         // Copy data attributes
+                         proxyEl.data(el.data());
+                         // Set classes for matching
+                         proxyEl.addClass(el.data("match-classes"))
+                     }
+                 );
+             }
+             $('<div/>', {
+                 "class": 'proxy-scroll hidden',
+                 "css": {
+                     "top": proxyScrollTop,
+                     "height": proxyScrollHeight,
+                 }
+             }).appendTo(proxyDoc);
+             if (hasScroll()) {
+                 $(".display .proxy-scroll").removeClass("hidden")
+             }
+
+             $(".display .proxy-doc")
+                 .mousedown(function (event) {
+                     saveDragPos = parseFloat(event.pageY);
+                     isDragging = true;
+                     event.preventDefault();
+                 })
+                 .mousemove(function (event) {
+                     const dragPos = parseFloat(event.pageY);
+                     if (isDragging) {
+                         const distanceMoved = dragPos - saveDragPos;
+                         scrollBy(distanceMoved);
+                         saveDragPos = dragPos;
+                         event.preventDefault();
+                     }
+                 })
+                 .mouseup(function (event) {
+                     isDragging = false;
+                 })
+                 .mouseenter(function () {
+                     disableScrollEvent = true;
+                     $(".display .proxy-scroll").addClass("hover")
+                 })
+                 .mouseleave(function () {
+                     isDragging = false;
+                     disableScrollEvent = false;
+                     $(".display .proxy-scroll").removeClass("hover")
+                 })
+                 .on('wheel', function (event) {
+                     scrollBy(event.originalEvent.deltaY / 4);
+                     event.preventDefault();
+                 });
+
+             // TODO: Handle user clicking in scroll region
+
+             $(".display .main-doc").scroll(function () {
+                 if (disableScrollEvent) return;
+                 $(".display .proxy-scroll")
+                     .css(
+                         "top", $(this).scrollTop() * scaleY
+                     )
+             })
+         }
+
+         function resizeend() {
+             if (new Date() - rtime < delta) {
+                 setTimeout(resizeend, delta);
+             } else {
+                 timeout = false;
+                 updateAnnotations();
+                 toggleScrollbar();
+             }
+         }
+
+         function toggleScrollbar() {
+             if (hasScroll()) {
+                 $(".display .proxy-scroll").removeClass("hidden");
+             } else {
+                 $(".display .proxy-scroll").addClass("hidden");
+             }
+         }
+
+         function updateAnnotations() {
+
+             annotateSemantic = $("#option-semantic").hasClass("selected");
+             annotateLexical = $("#option-lexical").hasClass("selected");
+             annotateEntities = $("#option-entity").hasClass("selected");
+             annotateNovel = $("#option-novel").hasClass("selected");
+
+             if (annotateSemantic || annotateLexical) {
+                 $(".summary-item").addClass("selectable")
+             } else {
+                 $(".summary-item").removeClass("selectable")
+             }
+
+             if (annotateLexical) {
+                 $(".underline").removeClass("annotation-hidden");
+                 $(".summary-item").addClass("annotate-lexical");
+             } else {
+                 $(".underline").addClass("annotation-hidden");
+                 $(".summary-item").removeClass("annotate-lexical");
+             }
+             if (annotateSemantic) {
+                 $(".highlight").removeClass("annotation-hidden");
+             } else {
+                 $(".highlight").addClass("annotation-hidden");
+             }
+             if (annotateEntities) {
+                 $(".summary-item").addClass("annotate-entities")
+             } else {
+                 $(".summary-item").removeClass("annotate-entities")
+             }
+             if (annotateNovel) {
+                 $(".summary-item").addClass("annotate-novel")
+             } else {
+                 $(".summary-item").removeClass("annotate-novel")
+             }
+
+             createProxy();
+
+             if (annotateLexical) {
+                 $(".proxy-underline").removeClass("annotation-hidden");
+             } else {
+                 $(".proxy-underline").addClass("annotation-hidden");
+             }
+             if (annotateSemantic) {
+                 $(".proxy-highlight").removeClass("annotation-hidden");
+             } else {
+                 $(".proxy-highlight").addClass("annotation-hidden");
+             }
+
+             $(".summary-item .highlight").tooltip("disable");
+             if (annotateSemantic) {
+                 $(".summary-item.selected .highlight").tooltip("enable")
+             }
+         }
+
+         function removeDocTooltips() {
+             $("[data-tooltip-timestamp]").tooltip("dispose").removeAttr("data-tooltip-timestamp");
+         }
+
+         function resetUnderlines() {
+             $('.annotation-invisible').removeClass("annotation-invisible");
+             $('.annotation-inactive').removeClass("annotation-inactive");
+             $('.temp-underline-color')
+                 .each(function () {
+                     $(this).css("border-color", $(this).data("primary-color"));
+                 })
+                 .removeClass("temp-underline-color")
+             $('.temp-proxy-underline-color')
+                 .each(function () {
+                     $(this).css("background-color", $(this).data("primary-color"));
+                 })
+                 .removeClass("temp-proxy-underline-color")
+         }
+
+         function showDocTooltip(el) {
+             const topDocHighlightId = $(el).data("top-doc-highlight-id");
+             const topDocSim = $(el).data("top-doc-sim");
+             const topHighlight = $(`.display .main-doc .highlight[data-highlight-id=${topDocHighlightId}]`);
+             if (!isViewable(topHighlight)) {
+                 return;
+             }
+             topHighlight.tooltip({title: `Most similar (${topDocSim})`, trigger: "manual", container: "body"});
+             topHighlight.tooltip("show");
+             const tooltipTimestamp = Date.now();
+             // Do not use .data() method to set data attributes as they are not searchable
+             topHighlight.attr("data-tooltip-timestamp", tooltipTimestamp);
+             setTimeout(function () {
+                 if (topHighlight.data("tooltip-timestamp") == tooltipTimestamp) {
+                     topHighlight.tooltip("dispose").removeAttr("data-tooltip-timestamp");
+                 }
+             }, 8000);
+         }
+
+         function highlightUnderlines() {
+             const spanId = getSpanId($(this));
+             const color = $(this).css("border-bottom-color");
+             // TODO Consolidate into single statement
+             $(`.summary-item.selected .underline.${spanId}`).removeClass("annotation-inactive");
+             $(`.doc .underline.${spanId}`)
+                 .removeClass("annotation-inactive")
+                 .each(function () {
+                     $(this).css("border-bottom-color", color);
+                 })
+                 .addClass("temp-underline-color");
+             $(`.proxy-underline.${spanId}`)
+                 .removeClass("annotation-inactive")
+                 .each(function () {
+                     $(this).css("background-color", color);
+                 })
+                 .addClass("temp-proxy-underline-color");
+
+             $(`.summary-item.selected .underline:not(.${spanId})`).addClass("annotation-inactive");
+             $(`.doc .underline:not(.${spanId})`).addClass("annotation-inactive");
+             $(`.proxy-underline:not(.${spanId})`).addClass("annotation-inactive");
+
+             $(".summary-item.selected .highlight:not(.annotation-hidden)").addClass("annotation-inactive");
+         }
+
+         function resetHighlights() {
+             removeDocTooltips();
+             $('.summary-item.selected .annotation-inactive').removeClass("annotation-inactive");
+             $('.summary-item.selected .annotation-invisible').removeClass("annotation-invisible");
+             $('.temp-highlight-color')
+                 .each(function () {
+                     $(this).css("background-color", $(this).data("primary-color"));
+                 })
+                 .removeClass("temp-highlight-color");
+             $('.highlight.selected').removeClass("selected");
+             $('.proxy-highlight.selected').removeClass("selected");
+             $('.summary-item [title]').removeAttr("title");
+         }
+
+         function highlightToken() {
+             const highlightId = $(this).data("highlight-id");
+             $(`.summary-item.selected .highlight:not(.summary-highlight-${highlightId})`).addClass("annotation-inactive");
+             $('.highlight.selected').removeClass("selected")
+             $('.proxy-highlight.selected').removeClass("selected")
+             const matchedDocHighlight = `.display .main-doc .summary-highlight-${highlightId}`;
+             const matchedProxyHighlight = `.proxy-doc .summary-highlight-${highlightId}`;
+             $(matchedDocHighlight + ", " + matchedProxyHighlight)
+                 .each(function () {
+                     const newHighlightColor = $(this).data(`color-${highlightId}`);
+                     $(this).css("background-color", newHighlightColor);
+                     $(this).addClass("selected");
+                 })
+                 .addClass("temp-highlight-color");
+             $(".underline").addClass("annotation-inactive");
+             $(".proxy-underline").addClass("annotation-invisible")
+             showDocTooltip(this);
+             $(this).addClass("selected");
+             $(this).removeClass("annotation-inactive");
+             $('.summary-item [title]').removeAttr("title");
+             if (!isViewable($(matchedDocHighlight))) {
+                 $(this).attr("title", "Click to scroll to most similar word.")
+             }
+         }
+
+         function isViewable(el) {
+             const elTop = el.offset().top;
+             const elBottom = elTop + el.outerHeight();
+             const scrollRegion = $(".display .main-doc");
+             const scrollTop = scrollRegion.offset().top;
+             const scrollBottom = scrollTop + scrollRegion.outerHeight();
+             return elTop > scrollTop && elBottom < scrollBottom;
+         }
+
+         // Initialization
+
+         $(function () {
+             $('[data-toggle="tooltip"]').tooltip({
+                 // 'boundary': '.summary-container'
+                 trigger: 'hover'
+             })
+         })
+         updateAnnotations();
+
+         // Bind events
+
+         $(window).resize(function () {
+             rtime = new Date();
+             if (timeout === false) {
+                 timeout = true;
+                 setTimeout(resizeend, delta);
+             }
+         });
+
+         $(".summary-list").on(
+             "click",
+             ".summary-item.selectable:not(.selected)",
+             function () {
+                 const summary_index = $(this).data("index");
+
+                 // Update summary items
+                 $(".summary-item.selected").removeClass("selected")
+                 $(this).addClass("selected")
+
+                 // Update doc
+                 // Show the version of document aligned with selected summary index
+                 $(`.doc[data-index=${summary_index}]`).removeClass("nodisplay").addClass("display");
+                 // Hide the version of document not aligned with selected summary index
+                 $(`.doc[data-index!=${summary_index}]`).removeClass("display").addClass("nodisplay");
+
+                 updateAnnotations();
+             }
+         );
+
+         $("#option-lexical").click(function () {
+             $(this).toggleClass("selected")
+             updateAnnotations()
+         });
+         $("#option-semantic").click(function () {
+             $(this).toggleClass("selected")
+             updateAnnotations()
+         });
+         $("#option-novel").click(function () {
+             $(this).toggleClass("selected")
+             updateAnnotations()
+         });
+         $("#option-entity").click(function () {
+             $(this).toggleClass("selected")
+             updateAnnotations()
+         });
+
+         const activeUnderlines = ".summary-item.selected .underline:not(.annotation-inactive):not(.annotation-hidden)";
+         $(".summary-list").on(
+             "mouseenter",
+             activeUnderlines,
+             function () {
+                 highlightUnderlines.call(this);
+             }
+         );
+
+         $(".summary-list").on(
+             "mouseleave",
+             activeUnderlines,
+             resetUnderlines
+         );
+         $(".summary-list").on(
+             "click",
+             activeUnderlines,
+             function () {
+                 // Find aligned underline in doc and scroll doc to that position
+                 highlightUnderlines.call(this);
+                 const mainDoc = $(".display .main-doc");
+                 const spanId = getSpanId($(this));
+                 const matchedUnderline = $(`.doc .underline.${spanId}`);
+                 mainDoc.animate({
+                         scrollTop: mainDoc.scrollTop() +
+                             matchedUnderline.offset().top - mainDoc.offset().top - 60
+                     },
+                     300
+                 )
+             }
+         );
+
+         const activeHighlights = ".summary-item.selected .highlight:not(.annotation-hidden):not(.matches-ngram), " +
+             ".summary-item.selected:not(.annotate-lexical) .highlight:not(.annotation-hidden)";
+         $(".summary-list").on(
+             "mouseenter",
+             activeHighlights,
+             function () {
+                 highlightToken.call(this);
+             })
+         $(".summary-list").on(
+             "mouseleave",
+             activeHighlights,
+             function () {
+                 resetHighlights();
+                 resetUnderlines();
+             }
+         );
+         $(".summary-list").on(
+             "click",
+             activeHighlights,
+             function () {
+                 highlightToken.call(this);
+                 // Find corresponding highlight in doc representing max similarity and scroll doc to that position
+                 const topDocHighlightId = $(this).data("top-doc-highlight-id");
+                 removeDocTooltips(topDocHighlightId);
+                 const topDocHighlight = $(`.display .main-doc .highlight[data-highlight-id=${topDocHighlightId}]`);
+                 const mainDoc = $(".display .main-doc");
+                 const el = this;
+                 mainDoc.animate({
+                         scrollTop: mainDoc.scrollTop() +
+                             topDocHighlight.offset().top - mainDoc.offset().top - 60
+                     },
+                     300,
+                     function () {
+                         setTimeout(
+                             function () {
+                                 // If no other tooltips have since been displayed
+                                 if ($("[data-tooltip-timestamp]").length == 0) {
+                                     showDocTooltip(el);
+                                 } else {
+                                     console.log("Not showing tooltip because one already exists")
+                                 }
+                             },
+                             100
+                         )
+                     }
+                 )
+             }
+         );
+         $(".summary-list").on(
+             "mouseleave",
+             ".summary-item.selected .content",
+             function () {
+                 resetHighlights();
+                 resetUnderlines();
+             },
+         );
+     }
+ );
+
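
The proxy-doc element populated above acts as a minimap of the document: createProxy() draws a scaled copy of each underline and highlight, and scrollBy() converts drags on the minimap thumb back into document scroll offsets. A condensed sketch of the two-way coordinate mapping this relies on (illustrative variable names and numbers, not taken from the file):

    // scaleY maps full-document pixels to minimap pixels
    const scaleY = proxyHeight / docScrollHeight;  // e.g. 610 / 6100 = 0.1
    const proxyY = docY * scaleY;                  // document position -> minimap position
    const docScrollTop = thumbTop / scaleY;        // minimap thumb drag -> document scrollTop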
utils.py ADDED
@@ -0,0 +1,6 @@
+ import re
+
+
+ def preprocess_text(text):
+     split_punct = re.escape(r'()')
+     return ' '.join(re.findall(rf"[^\s{split_punct}]+|[{split_punct}]", text))
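
For clarity, preprocess_text pads parentheses with surrounding spaces so that a downstream whitespace tokenizer treats them as standalone tokens. A doctest-style sketch with a hypothetical input (not part of the commit):

    >>> preprocess_text("Apple Inc. (AAPL) closed higher")
    'Apple Inc. ( AAPL ) closed higher'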
website/annotations.png ADDED
website/demo.gif ADDED
website/main-vis.jpg ADDED
website/title.png ADDED
website/triangle.png ADDED