Commit 6124176 (unverified) · Initial commit

Files changed:
- .gitattributes +18 -0
- .gitignore +1 -0
- LICENSE +201 -0
- README.md +372 -0
- align.py +346 -0
- app.py +375 -0
- components.py +563 -0
- data/10:cnn_dailymail_1000.validation/_dataset/data.gz +3 -0
- data/10:cnn_dailymail_1000.validation/metadata.json +1 -0
- generation.py +142 -0
- preprocessing.py +701 -0
- requirements.txt +11 -0
- resources/jquery.color-2.1.2.min.js +2 -0
- resources/summvis.css +347 -0
- resources/summvis.js +518 -0
- utils.py +6 -0
- website/annotations.png +0 -0
- website/demo.gif +0 -0
- website/main-vis.jpg +0 -0
- website/title.png +0 -0
- website/triangle.png +0 -0
.gitattributes
ADDED
@@ -0,0 +1,18 @@
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tar.gz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1 @@
.DS_STORE
LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2021 SummVis

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
README.md
ADDED
@@ -0,0 +1,372 @@
# SummVis

SummVis is an open-source visualization tool that supports fine-grained analysis of summarization models, data, and evaluation
metrics. Through its lexical and semantic visualizations, SummVis enables in-depth exploration across important dimensions such as factual consistency and abstractiveness.

Authors: [Jesse Vig](https://twitter.com/jesse_vig)<sup>1</sup>,
[Wojciech Kryściński](https://twitter.com/iam_wkr)<sup>1</sup>,
[Karan Goel](https://twitter.com/krandiash)<sup>2</sup>,
[Nazneen Fatema Rajani](https://twitter.com/nazneenrajani)<sup>1</sup><br/>
<sup>1</sup>[Salesforce Research](https://einstein.ai/) <sup>2</sup>[Stanford Hazy Research](https://hazyresearch.stanford.edu/)

📖 [Paper](https://arxiv.org/abs/2104.07605)
🎥 [Demo](https://vimeo.com/540429745)

<p>
<img src="website/demo.gif" alt="Demo gif"/>
</p>

_Note: SummVis is under active development, so expect continued updates in the coming weeks and months.
Feel free to raise issues for questions, suggestions, requests or bug reports._

## Table of Contents
- [User guide](#user-guide)
- [Installation](#installation)
- [Quickstart](#quickstart)
- [Running with pre-loaded datasets](#running-with-pre-loaded-datasets)
- [Get your data into SummVis](#get-your-data-into-summvis)
- [Citation](#citation)
- [Acknowledgements](#acknowledgements)

## User guide

### Overview
SummVis is a tool for analyzing abstractive summarization systems. It provides fine-grained insights on summarization
models, data, and evaluation metrics by visualizing the relationships between source documents, reference summaries,
and generated summaries, as illustrated in the figure below.<br/>



### Interface

The SummVis interface is shown below. The example displayed is the first record from the
[CNN / Daily Mail](https://huggingface.co/datasets/cnn_dailymail) validation set.



#### Components

**(a)** Configuration panel<br/>
**(b)** Source document (or reference summary, depending on configuration)<br/>
**(c)** Generated summaries (and/or reference summary, depending on configuration)<br/>
**(d)** Scroll bar with global view of annotations<br/>

#### Annotations
<img src="website/annotations.png" width="548" height="39" alt="Annotations"/>

**N-gram overlap:** Word sequences that overlap between the document on the left and
the selected summary on the right. Underlines are color-coded by index of summary sentence. <br/>
**Semantic overlap**: Words in the summary that are semantically close to one or more words in the document on the left.<br/>
**Novel words**: Words in the summary that do not appear in the document on the left.<br/>
**Novel entities**: Entity words in the summary that do not appear in the document on the left.<br/>

### Limitations
Currently only English text is supported.

## Installation
**IMPORTANT**: Please use `python>=3.8` since some dependencies require that for installation.
```shell
# Requires python>=3.8
git clone https://github.com/robustness-gym/summvis.git
cd summvis
pip install -r requirements.txt
python -m spacy download en_core_web_sm
```

Installation takes around 2 minutes on a MacBook Pro.

## Quickstart
Follow the steps below to start using SummVis immediately.

### 1. Download and extract data
Download our pre-cached dataset that contains predictions for state-of-the-art models such as PEGASUS and BART on
1000 examples taken from the CNN / Daily Mail validation set.
```shell
mkdir data
mkdir preprocessing
curl https://storage.googleapis.com/sfr-summvis-data-research/cnn_dailymail_1000.validation.anonymized.zip --output preprocessing/cnn_dailymail_1000.validation.anonymized.zip
unzip preprocessing/cnn_dailymail_1000.validation.anonymized.zip -d preprocessing/
```

### 2. Deanonymize data
Next, we'll need to add the original examples from the CNN / Daily Mail dataset to deanonymize the data (this information
is omitted for copyright reasons). The `preprocessing.py` script can be used for this with the `--deanonymize` flag.

#### Deanonymize 10 examples:
```shell
python preprocessing.py \
--deanonymize \
--dataset_rg preprocessing/cnn_dailymail_1000.validation.anonymized \
--dataset cnn_dailymail \
--version 3.0.0 \
--split validation \
--processed_dataset_path data/10:cnn_dailymail_1000.validation \
--n_samples 10
```
This will take either a few seconds or a few minutes depending on whether you've previously loaded CNN/DailyMail from
the Datasets library.

### 3. Run SummVis
Finally, we're ready to run the Streamlit app. Once the app loads, make sure it's pointing to the right `File` at the top
of the interface.
```shell
streamlit run summvis.py
```

## Running with pre-loaded datasets

In this section we extend the approach described in [Quickstart](#quickstart) to other pre-loaded datasets.

### 1. Download one of the pre-loaded datasets:

##### CNN / Daily Mail (1000 examples from validation set): https://storage.googleapis.com/sfr-summvis-data-research/cnn_dailymail_1000.validation.anonymized.zip
##### CNN / Daily Mail (full validation set): https://storage.googleapis.com/sfr-summvis-data-research/cnn_dailymail.validation.anonymized.zip
##### XSum (1000 examples from validation set): https://storage.googleapis.com/sfr-summvis-data-research/xsum_1000.validation.anonymized.zip
##### XSum (full validation set): https://storage.googleapis.com/sfr-summvis-data-research/xsum.validation.anonymized.zip

We recommend that you choose the smallest dataset that fits your needs in order to minimize download / preprocessing time.

#### Example: Download and unzip CNN / Daily Mail
```shell
mkdir data
mkdir preprocessing
curl https://storage.googleapis.com/sfr-summvis-data-research/cnn_dailymail_1000.validation.anonymized.zip --output preprocessing/cnn_dailymail_1000.validation.anonymized.zip
unzip preprocessing/cnn_dailymail_1000.validation.anonymized.zip -d preprocessing/
```

#### Example: Download and unzip XSum
```shell
mkdir data
mkdir preprocessing
curl https://storage.googleapis.com/sfr-summvis-data-research/xsum_1000.validation.anonymized.zip --output preprocessing/xsum_1000.validation.anonymized.zip
unzip preprocessing/xsum_1000.validation.anonymized.zip -d preprocessing/
```

### 2. Deanonymize *n* examples:

Set the `--n_samples` argument and name the `--processed_dataset_path` output file accordingly.

#### Example: Deanonymize 100 examples from CNN / Daily Mail:
```shell
python preprocessing.py \
--deanonymize \
--dataset_rg preprocessing/cnn_dailymail_1000.validation.anonymized \
--dataset cnn_dailymail \
--version 3.0.0 \
--split validation \
--processed_dataset_path data/100:cnn_dailymail_1000.validation \
--n_samples 100
```

#### Example: Deanonymize all pre-loaded examples from CNN / Daily Mail (1000 examples dataset):
```shell
python preprocessing.py \
--deanonymize \
--dataset_rg preprocessing/cnn_dailymail_1000.validation.anonymized \
--dataset cnn_dailymail \
--version 3.0.0 \
--split validation \
--processed_dataset_path data/full:cnn_dailymail_1000.validation \
--n_samples 1000
```

#### Example: Deanonymize all pre-loaded examples from CNN / Daily Mail (full dataset):
```shell
python preprocessing.py \
--deanonymize \
--dataset_rg preprocessing/cnn_dailymail.validation.anonymized \
--dataset cnn_dailymail \
--version 3.0.0 \
--split validation \
--processed_dataset_path data/full:cnn_dailymail.validation
```

#### Example: Deanonymize all pre-loaded examples from XSum (1000 examples dataset):
```shell
python preprocessing.py \
--deanonymize \
--dataset_rg preprocessing/xsum_1000.validation.anonymized \
--dataset xsum \
--split validation \
--processed_dataset_path data/full:xsum_1000.validation \
--n_samples 1000
```

### 3. Run SummVis
Once the app loads, make sure it's pointing to the right `File` at the top
of the interface.
```shell
streamlit run summvis.py
```

Alternatively, you can point SummVis to a folder where your data is stored:
```shell
streamlit run summvis.py -- --path your/path/to/data
```
Note that the additional `--` is not a mistake, and is required to pass command-line arguments to Streamlit.


## Get your data into SummVis

The simplest way to use SummVis with your own data is to create a jsonl file of the following format:

```
{"document": "This is the first source document", "summary:reference": "This is the reference summary", "summary:testmodel1": "This is the summary for testmodel1", "summary:testmodel2": "This is the summary for testmodel2"}
{"document": "This is the second source document", "summary:reference": "This is the reference summary", "summary:testmodel1": "This is the summary for testmodel1", "summary:testmodel2": "This is the summary for testmodel2"}
```

The key for the reference summary must equal `summary:reference` and the key for any other summary must be of the form
`summary:<summary_name>`, e.g. `summary:BART`. The document and at least one summary (reference, other, or both) are required.

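For example, a file in this format can be produced with a few lines of Python. This is a minimal sketch; the output path and the `summary:testmodel1` key are placeholders matching the example above:

```python
import json

examples = [
    {
        "document": "This is the first source document",
        "summary:reference": "This is the reference summary",
        "summary:testmodel1": "This is the summary for testmodel1",
    },
    # ... one dict per example, with any number of summary:<summary_name> keys
]

# jsonl format: one JSON object per line
with open("data/my_dataset.jsonl", "w") as f:
    for example in examples:
        f.write(json.dumps(example) + "\n")
```
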
The following additional install step is required:
```
python -m spacy download en_core_web_lg
```

You have two options to load this jsonl file into the tool:

#### Option 1: Load the jsonl file directly

The disadvantage of this approach is that all computations are performed in real time. This is particularly expensive for
semantic similarity, which uses a Transformer model. As a result, each example will be slow to load (~5-15 seconds on a MacBook Pro).

1. Place the jsonl file in the `data` directory. Note that the file must be named with a `.jsonl` extension.
2. Start SummVis: `streamlit run summvis.py`
3. Select your jsonl file from the `File` dropdown at the top of the interface.

#### Option 2: Preprocess jsonl file (recommended)

You may run `preprocessing.py` to precompute all data required in the interface (running `spaCy`, lexical and semantic
aligners) and save a cache file, which can be read directly into the tool. Note that this script may run for a while
(~5-15 seconds per example on a MacBook Pro for
documents of typical length found in CNN/DailyMail or XSum), and will be greatly expedited by running on a GPU.

1. Run preprocessing script to generate cache file
```shell
python preprocessing.py \
--workflow \
--dataset_jsonl path/to/my_dataset.jsonl \
--processed_dataset_path path/to/my_cache_file
```
You may wish to first try it with a subset of your data by adding the following argument: `--n_samples <number_of_samples>`.

2. Copy output cache file to the `data` directory
3. Start SummVis: `streamlit run summvis.py`
4. Select your file from the `File` dropdown at the top of the interface.

As an alternative to steps 2-3, you may point SummVis to a folder in which the cache file is stored:
```shell
streamlit run summvis.py -- --path <parent_directory_of_cache_file>
```
### Generating predictions
The instructions in the previous section assume access to model predictions. We also provide tools to load predictions,
either by downloading datasets with precomputed predictions or running
a script to generate predictions for HuggingFace-compatible models. In this section we describe an end-to-end pipeline
for using these tools.


Prior to running the following, an additional install step is required:

```
python -m spacy download en_core_web_lg
```

#### 1. Standardize and save dataset to disk.
Loads a dataset from HF, or any dataset that you have, and stores it in a
standardized format with columns for `document` and `summary:reference`.

##### Example: Save CNN / Daily Mail validation split to disk as a jsonl file.
```shell
python preprocessing.py \
--standardize \
--dataset cnn_dailymail \
--version 3.0.0 \
--split validation \
--save_jsonl_path preprocessing/cnn_dailymail.validation.jsonl
```

##### Example: Load custom `my_dataset.jsonl`, standardize, and save.
```shell
python preprocessing.py \
--standardize \
--dataset_jsonl path/to/my_dataset.jsonl \
--save_jsonl_path preprocessing/my_dataset.jsonl
```

Expected format of `my_dataset.jsonl`:
```
{"document": "This is the first source document", "summary:reference": "This is the reference summary"}
{"document": "This is the second source document", "summary:reference": "This is the reference summary"}
```

If you wish to use column names other than `document` and `summary:reference`, you may specify custom column names
using the `doc_column` and `reference_column` command-line arguments.

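Equivalently, you can rewrite such a file yourself before standardizing. The following is a minimal Python sketch, assuming hypothetical input columns named `text` and `target` and a hypothetical input file `raw.jsonl`:

```python
import json

# Hypothetical input columns: "text" (source document) and "target" (reference summary)
with open("raw.jsonl") as fin, open("preprocessing/my_dataset.jsonl", "w") as fout:
    for line in fin:
        record = json.loads(line)
        # Map custom column names onto the standardized schema
        fout.write(json.dumps({
            "document": record["text"],
            "summary:reference": record["target"],
        }) + "\n")
```
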
#### 2. Add predictions to the saved dataset.
Takes a saved dataset that has already been standardized and adds predictions to it
from prediction jsonl files. Cached predictions for several models are available here:
https://storage.googleapis.com/sfr-summvis-data-research/predictions.zip

You may also generate your own predictions using [this script](generation.py).

##### Example: Add 6 prediction files for PEGASUS and BART to the dataset.
```shell
python preprocessing.py \
--join_predictions \
--dataset_jsonl preprocessing/cnn_dailymail.validation.jsonl \
--prediction_jsonls \
predictions/bart-cnndm.cnndm.validation.results.anonymized \
predictions/bart-xsum.cnndm.validation.results.anonymized \
predictions/pegasus-cnndm.cnndm.validation.results.anonymized \
predictions/pegasus-multinews.cnndm.validation.results.anonymized \
predictions/pegasus-newsroom.cnndm.validation.results.anonymized \
predictions/pegasus-xsum.cnndm.validation.results.anonymized \
--save_jsonl_path preprocessing/cnn_dailymail.validation.jsonl
```

#### 3. Run the preprocessing workflow and save the dataset.
Takes a saved dataset that has been standardized and has had predictions added,
applies all the preprocessing steps to it (running `spaCy`, lexical and semantic aligners),
and stores the processed dataset back to disk.

##### Example: Autorun with default settings on a few examples to try it.
```shell
python preprocessing.py \
--workflow \
--dataset_jsonl preprocessing/cnn_dailymail.validation.jsonl \
--processed_dataset_path data/cnn_dailymail.validation \
--try_it
```

##### Example: Autorun with default settings on all examples.
```shell
python preprocessing.py \
--workflow \
--dataset_jsonl preprocessing/cnn_dailymail.validation.jsonl \
--processed_dataset_path data/cnn_dailymail
```


## Citation

When referencing this repository, please cite [this paper](https://arxiv.org/abs/2104.07605):

```
@misc{vig2021summvis,
      title={SummVis: Interactive Visual Analysis of Models, Data, and Evaluation for Text Summarization},
      author={Jesse Vig and Wojciech Kryscinski and Karan Goel and Nazneen Fatema Rajani},
      year={2021},
      eprint={2104.07605},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2104.07605}
}
```

## Acknowledgements

We thank [Michael Correll](http://correll.io) for his valuable feedback.

align.py
ADDED
@@ -0,0 +1,346 @@
import heapq
import itertools
from abc import ABC, abstractmethod
from collections import defaultdict
from operator import itemgetter
from typing import List, Dict, Tuple
from typing import Sequence

import numpy as np
import torch
from bert_score import BERTScorer
from nltk import PorterStemmer
from spacy.tokens import Doc, Span
from toolz import itertoolz
from transformers import AutoTokenizer
from transformers.tokenization_utils_base import PaddingStrategy


class EmbeddingModel(ABC):
    @abstractmethod
    def embed(
        self,
        sents: List[Span]
    ):
        pass


class ContextualEmbedding(EmbeddingModel):

    def __init__(self, model, tokenizer_name, max_length):
        self.model = model
        self.tokenizer = SpacyHuggingfaceTokenizer(tokenizer_name, max_length)
        self._device = model.device

    def embed(
        self,
        sents: List[Span]
    ):
        encoded_input, special_tokens_masks, token_alignments = self.tokenizer.batch_encode(sents)
        encoded_input = {k: v.to(self._device) for k, v in encoded_input.items()}
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        embeddings = model_output[0].cpu()

        spacy_embs_list = []
        for embs, mask, token_alignment \
                in zip(embeddings, special_tokens_masks, token_alignments):
            mask = torch.tensor(mask)
            embs = embs[mask == 0]  # Filter embeddings at special token positions
            spacy_embs = []
            for hf_idxs in token_alignment:
                if hf_idxs is None:
                    pooled_embs = torch.zeros_like(embs[0])
                else:
                    pooled_embs = embs[hf_idxs].mean(dim=0)  # Pool embeddings that map to the same spacy token
                spacy_embs.append(pooled_embs.numpy())
            spacy_embs = np.stack(spacy_embs)
            spacy_embs = spacy_embs / np.linalg.norm(spacy_embs, axis=-1, keepdims=True)  # Normalize
            spacy_embs_list.append(spacy_embs)
        for embs, sent in zip(spacy_embs_list, sents):
            assert len(embs) == len(sent)
        return spacy_embs_list


class StaticEmbedding(EmbeddingModel):

    def embed(
        self,
        sents: List[Span]
    ):
        return [
            np.stack([t.vector / (t.vector_norm or 1) for t in sent])
            for sent in sents
        ]


class EmbeddingAligner():

    def __init__(
        self,
        embedding: EmbeddingModel,
        threshold: float,
        top_k: int,
        baseline_val=0
    ):
        self.threshold = threshold
        self.top_k = top_k
        self.embedding = embedding
        self.baseline_val = baseline_val

    def align(
        self,
        source: Doc,
        targets: Sequence[Doc]
    ) -> List[Dict]:
        """Compute alignment from summary tokens to doc tokens with greatest semantic similarity
        Args:
            source: Source spaCy document
            targets: Target spaCy documents
        Returns: List of alignments, one for each target document
        """
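        # Each returned alignment maps a target token index to its top-k
        # (source token index, similarity score) pairs, highest score first.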
        if len(source) == 0:
            return [{} for _ in targets]
        all_sents = list(source.sents) + list(itertools.chain.from_iterable(target.sents for target in targets))
        chunk_sizes = [_iter_len(source.sents)] + \
                      [_iter_len(target.sents) for target in targets]
        all_sents_token_embeddings = self.embedding.embed(all_sents)
        chunked_sents_token_embeddings = _split(all_sents_token_embeddings, chunk_sizes)
        source_sent_token_embeddings = chunked_sents_token_embeddings[0]
        source_token_embeddings = np.concatenate(source_sent_token_embeddings)
        for token_idx, token in enumerate(source):
            if token.is_stop or token.is_punct:
                source_token_embeddings[token_idx] = 0
        alignments = []
        for i, target in enumerate(targets):
            target_sent_token_embeddings = chunked_sents_token_embeddings[i + 1]
            target_token_embeddings = np.concatenate(target_sent_token_embeddings)
            for token_idx, token in enumerate(target):
                if token.is_stop or token.is_punct:
                    target_token_embeddings[token_idx] = 0
            alignment = defaultdict(list)
            for score, target_idx, source_idx in self._emb_sim_sparse(
                target_token_embeddings,
                source_token_embeddings,
            ):
                alignment[target_idx].append((source_idx, score))
            # TODO used argpartition to get nlargest
            for j in list(alignment):
                alignment[j] = heapq.nlargest(self.top_k, alignment[j], itemgetter(1))
            alignments.append(alignment)
        return alignments

    def _emb_sim_sparse(self, embs_1, embs_2):
        sim = embs_1 @ embs_2.T
        sim = (sim - self.baseline_val) / (1 - self.baseline_val)
        keep = sim > self.threshold
        keep_idxs_1, keep_idxs_2 = np.where(keep)
        keep_scores = sim[keep]
        return list(zip(keep_scores, keep_idxs_1, keep_idxs_2))


class BertscoreAligner(EmbeddingAligner):
    def __init__(
        self,
        threshold,
        top_k
    ):
        scorer = BERTScorer(lang="en", rescale_with_baseline=True)
        model = scorer._model
        embedding = ContextualEmbedding(model, "roberta-large", 510)
        baseline_val = scorer.baseline_vals[2].item()

        super(BertscoreAligner, self).__init__(
            embedding, threshold, top_k, baseline_val
        )


class StaticEmbeddingAligner(EmbeddingAligner):
    def __init__(
        self,
        threshold,
        top_k
    ):
        embedding = StaticEmbedding()
        super(StaticEmbeddingAligner, self).__init__(
            embedding, threshold, top_k
        )


class NGramAligner():

    def __init__(self):
        self.stemmer = PorterStemmer()

    def align(
        self,
        source: Doc,
        targets: List[Doc],
    ) -> List[Dict]:

        alignments = []
        source_ngram_spans = self._get_ngram_spans(source)
        for target in targets:
            target_ngram_spans = self._get_ngram_spans(target)
            alignments.append(
                self._align_ngrams(target_ngram_spans, source_ngram_spans)
            )
        return alignments

    def _get_ngram_spans(
        self,
        doc: Doc,
    ):
        ngrams = []
        for sent in doc.sents:
            for n in range(1, len(list(sent))):
                tokens = [t for t in sent if not (t.is_stop or t.is_punct)]
                ngrams.extend(_ngrams(tokens, n))

        def ngram_key(ngram):
            return tuple(self.stemmer.stem(token.text).lower() for token in ngram)

        key_to_ngrams = itertoolz.groupby(ngram_key, ngrams)
        key_to_spans = {}
        for k, grouped_ngrams in key_to_ngrams.items():
            key_to_spans[k] = [
                (ngram[0].i, ngram[-1].i + 1)
                for ngram in grouped_ngrams
            ]
        return key_to_spans

    def _align_ngrams(
        self,
        ngram_spans_1: Dict[Tuple[str], List[Tuple[int, int]]],
        ngram_spans_2: Dict[Tuple[str], List[Tuple[int, int]]]
    ) -> Dict[Tuple[int, int], List[Tuple[int, int]]]:
        """Align ngram spans between two documents
        Args:
            ngram_spans_1: Map from (normalized_token1, normalized_token2, ...) n-gram tuple to a list of token spans
                of format (start_pos, end_pos)
            ngram_spans_2: Same format as above, but for second text
        Returns: map from each (start, end) span in text 1 to list of aligned (start, end) spans in text 2
        """
        if not ngram_spans_1 or not ngram_spans_2:
            return {}
        max_span_end_1 = max(span[1] for span in itertools.chain.from_iterable(ngram_spans_1.values()))
        token_is_available_1 = [True] * max_span_end_1  # Track which token positions in text 1 are still unmatched
        matched_keys = list(set(ngram_spans_1.keys()) & set(ngram_spans_2.keys()))  # Matched normalized ngrams between the two texts
        matched_keys.sort(key=len, reverse=True)  # Process n-grams from longest to shortest

        alignment = defaultdict(list)  # Map from each matched span in text 1 to list of aligned spans in text 2
        for key in matched_keys:
            spans_1 = ngram_spans_1[key]
            spans_2 = ngram_spans_2[key]
            available_spans_1 = [span for span in spans_1 if all(token_is_available_1[slice(*span)])]
            matched_spans_1 = []
            if available_spans_1 and spans_2:
                # if ngram can be matched to available spans in both sequences
                for span in available_spans_1:
                    # It's possible that these newly matched spans may be overlapping with one another, so
                    # check that token positions are still available (only one span allowed per token in text 1):
                    if all(token_is_available_1[slice(*span)]):
                        matched_spans_1.append(span)
                        token_is_available_1[slice(*span)] = [False] * (span[1] - span[0])
            for span1 in matched_spans_1:
                alignment[span1] = spans_2

        return alignment


class SpacyHuggingfaceTokenizer:
    def __init__(
        self,
        model_name,
        max_length
    ):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
        self.max_length = max_length

    def batch_encode(
        self,
        sents: List[Span]
    ):
        token_alignments = []
        token_ids_list = []

        # Tokenize each sentence and special tokens.
        for sent in sents:
            hf_tokens, token_alignment = self.tokenize(sent)
            token_alignments.append(token_alignment)
            token_ids = self.tokenizer.convert_tokens_to_ids(hf_tokens)
            encoding = self.tokenizer.prepare_for_model(
                token_ids,
                add_special_tokens=True,
                padding=False,
            )
            token_ids_list.append(encoding['input_ids'])

        # Add padding
        max_length = max(map(len, token_ids_list))
        attention_mask = []
        input_ids = []
        special_tokens_masks = []
        for token_ids in token_ids_list:
            encoding = self.tokenizer.prepare_for_model(
                token_ids,
                padding=PaddingStrategy.MAX_LENGTH,
                max_length=max_length,
                add_special_tokens=False
            )
            input_ids.append(encoding['input_ids'])
            attention_mask.append(encoding['attention_mask'])
            special_tokens_masks.append(
                self.tokenizer.get_special_tokens_mask(
                    encoding['input_ids'],
                    already_has_special_tokens=True
                )
            )

        encoded = {
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_mask)
        }
        return encoded, special_tokens_masks, token_alignments

    def tokenize(
        self,
        sent
    ):
        """Convert spacy sentence to huggingface tokens and compute the alignment"""
        hf_tokens = []
        token_alignment = []
        for i, token in enumerate(sent):
            # "Tokenize" each word individually, so as to track the alignment between spaCy/HF tokens
            # Prefix all tokens with a space except the first one in the sentence
            if i == 0:
                token_text = token.text
            else:
                token_text = ' ' + token.text
            start_hf_idx = len(hf_tokens)
            word_tokens = self.tokenizer.tokenize(token_text)
            end_hf_idx = len(hf_tokens) + len(word_tokens)
            if end_hf_idx < self.max_length:
                hf_tokens.extend(word_tokens)
                hf_idxs = list(range(start_hf_idx, end_hf_idx))
            else:
                hf_idxs = None
            token_alignment.append(hf_idxs)
        return hf_tokens, token_alignment


def _split(data, sizes):
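    # Partition `data` into consecutive chunks of the given sizes,
    # e.g. _split(['a', 'b', 'c'], sizes=[1, 2]) -> [['a'], ['b', 'c']]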
    it = iter(data)
    return [[next(it) for _ in range(size)] for size in sizes]


def _iter_len(it):
    return sum(1 for _ in it)

# TODO set up batching
# To get top K axis and value per row: https://stackoverflow.com/questions/42832711/using-np-argpartition-to-index-values-in-a-multidimensional-array


def _ngrams(tokens, n):
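    # Yield each contiguous run of n tokens (e.g. n=2 yields all bigrams)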
    for i in range(len(tokens) - n + 1):
        yield tokens[i:i + n]
app.py
ADDED
@@ -0,0 +1,375 @@
import argparse
import json
import operator
import os
import re
from pathlib import Path

import spacy
import streamlit as st
from robustnessgym import Dataset, Identifier
from robustnessgym import Spacy
from spacy.tokens import Doc

from align import NGramAligner, BertscoreAligner, StaticEmbeddingAligner
from components import MainView
from preprocessing import NGramAlignerCap, StaticEmbeddingAlignerCap, \
    BertscoreAlignerCap
from preprocessing import _spacy_decode, _spacy_encode
from utils import preprocess_text

MIN_SEMANTIC_SIM_THRESHOLD = 0.1
MAX_SEMANTIC_SIM_TOP_K = 10

Doc.set_extension("name", default=None, force=True)
Doc.set_extension("column", default=None, force=True)


class Instance():
    def __init__(self, id_, document, reference, preds, data=None):
        self.id = id_
        self.document = document
        self.reference = reference
        self.preds = preds
        self.data = data


@st.cache(allow_output_mutation=True)
def load_from_index(filename, index):
    with open(filename) as f:
        for i, line in enumerate(f):
            if i == index:
                return json.loads(line.strip())


@st.cache(allow_output_mutation=True)
def load_dataset(path: str):
    if path.endswith('.jsonl'):
        return Dataset.from_jsonl(path)
    try:
        return Dataset.load_from_disk(path)
    except NotADirectoryError:
        return Dataset.from_jsonl(path)


@st.cache(allow_output_mutation=True)
def get_nlp():
    os.popen('python -m spacy download en_core_web_sm').read()
    try:
        nlp = spacy.load("en_core_web_lg")
    except:
        nlp = spacy.load("en_core_web_sm")
        is_lg = False
    else:
        is_lg = True
    nlp.add_pipe('sentencizer', before="parser")
    return nlp, is_lg


def retrieve(dataset, index, filename=None):
    if index >= len(dataset):
        st.error(f"Index {index} exceeds dataset length.")

    eval_dataset = None
    if filename:
        # TODO Handle this through dedicated fields
        if "cnn_dailymail" in filename:
            eval_dataset = "cnndm"
        elif "xsum" in filename:
            eval_dataset = "xsum"

    data = dataset[index]
    id_ = data.get('id', '')

    try:
        document = rg_spacy.decode(
            data[rg_spacy.identifier(columns=['preprocessed_document'])]
        )
    except KeyError:
        if not is_lg:
            st.error("The 'en_core_web_lg' model is required unless loading from a cached file. "
                     "To install: 'python -m spacy download en_core_web_lg'")
        try:
            text = data['document']
        except KeyError:
            text = data['article']
        if not text:
            st.error("Document is blank")
            return
        document = nlp(preprocess_text(text))
    document._.name = "Document"
    document._.column = "document"

    try:
        reference = rg_spacy.decode(
            data[rg_spacy.identifier(columns=['preprocessed_summary:reference'])]
        )
    except KeyError:
        if not is_lg:
            st.error("The 'en_core_web_lg' model is required unless loading from a cached file. "
                     "To install: 'python -m spacy download en_core_web_lg'")
        try:
            text = data['summary'] if 'summary' in data else data['summary:reference']
        except KeyError:
            text = data.get('highlights')
        if text:
            reference = nlp(preprocess_text(text))
        else:
            reference = None
    if reference is not None:
        reference._.name = "Reference"
        reference._.column = "summary:reference"

    model_names = set()
    for k in data:
        m = re.match('(preprocessed_)?summary:(?P<model>.*)', k)
        if m:
            model_name = m.group('model')
            if model_name != 'reference':
                model_names.add(model_name)

    preds = []
    for model_name in model_names:
        try:
            pred = rg_spacy.decode(
                data[rg_spacy.identifier(columns=[f"preprocessed_summary:{model_name}"])]
            )
        except KeyError:
            if not is_lg:
                st.error("The 'en_core_web_lg' model is required unless loading from a cached file. "
                         "To install: 'python -m spacy download en_core_web_lg'")
            pred = nlp(preprocess_text(data[f"summary:{model_name}"]))

        parts = model_name.split("-")
        primary_sort = 0
        if len(parts) == 2:
            model, train_dataset = parts
            if train_dataset == eval_dataset:
                formatted_model_name = model.upper()
            else:
                formatted_model_name = f"{model.upper()} ({train_dataset.upper()}-trained)"
                if train_dataset in ["xsum", "cnndm"]:
                    primary_sort = 1
                else:
                    primary_sort = 2
        else:
            formatted_model_name = model_name.upper()
        pred._.name = formatted_model_name
        pred._.column = f"summary:{model_name}"
        preds.append(
            ((primary_sort, formatted_model_name), pred)
        )

    preds = [pred for _, pred in sorted(preds)]

    return Instance(
        id_=id_,
        document=document,
        reference=reference,
        preds=preds,
        data=data,
    )


def filter_alignment(alignment, threshold, top_k):
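    # Keep only matches scoring at least `threshold`, then retain the
    # top_k highest-scoring matches per key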
|
175 |
+
filtered_alignment = {}
|
176 |
+
for k, v in alignment.items():
|
177 |
+
filtered_matches = [(match_idx, score) for match_idx, score in v if score >= threshold]
|
178 |
+
if filtered_matches:
|
179 |
+
filtered_alignment[k] = sorted(filtered_matches, key=operator.itemgetter(1), reverse=True)[:top_k]
|
180 |
+
return filtered_alignment
|
181 |
+
|
182 |
+
|
183 |
+
def select_comparison(example):
    all_summaries = []

    if example.reference:
        all_summaries.append(example.reference)
    if example.preds:
        all_summaries.extend(example.preds)

    from_documents = [example.document]
    if example.reference:
        from_documents.append(example.reference)
    document_names = [document._.name for document in from_documents]
    select_document_name = sidebar_placeholder_from.selectbox(
        label="Comparison FROM:",
        options=document_names
    )
    document_index = document_names.index(select_document_name)
    selected_document = from_documents[document_index]

    remaining_summaries = [summary for summary in all_summaries
                           if summary._.name != selected_document._.name]
    remaining_summary_names = [summary._.name for summary in remaining_summaries]

    selected_summary_names = sidebar_placeholder_to.multiselect(
        'Comparison TO:',
        remaining_summary_names,
        remaining_summary_names
    )
    selected_summaries = []
    for summary_name in selected_summary_names:
        summary_index = remaining_summary_names.index(summary_name)
        selected_summaries.append(remaining_summaries[summary_index])
    return selected_document, selected_summaries

def show_main(example):
    # Get user input

    semantic_sim_type = st.sidebar.radio(
        "Semantic similarity type:",
        ["Contextual embedding", "Static embedding"]
    )
    semantic_sim_threshold = st.sidebar.slider(
        "Semantic similarity threshold:",
        min_value=MIN_SEMANTIC_SIM_THRESHOLD,
        max_value=1.0,
        step=0.1,
        value=0.2,
    )
    semantic_sim_top_k = st.sidebar.slider(
        "Semantic similarity top-k:",
        min_value=1,
        max_value=MAX_SEMANTIC_SIM_TOP_K,
        step=1,
        value=10,
    )

    document, summaries = select_comparison(example)
    layout = st.sidebar.radio("Layout:", ["Vertical", "Horizontal"]).lower()
    # if layout == "horizontal":
    #     scroll = st.sidebar.checkbox(label="Scroll sections", value=True)
    # else:
    scroll = True
    gray_out_stopwords = st.sidebar.checkbox(label="Gray out stopwords", value=True)

    # Gather data
    try:
        lexical_alignments = [
            NGramAlignerCap.decode(
                example.data[
                    Identifier(NGramAlignerCap.__name__)(
                        columns=[
                            f'preprocessed_{document._.column}',
                            f'preprocessed_{summary._.column}',
                        ]
                    )
                ])[0]
            for summary in summaries
        ]
        lexical_alignments = [
            {k: [(pair[0], int(pair[1])) for pair in v]
             for k, v in d.items()}
            for d in lexical_alignments
        ]
    except KeyError:
        lexical_alignments = NGramAligner().align(document, summaries)

    if semantic_sim_type == "Static embedding":
        try:
            semantic_alignments = [
                StaticEmbeddingAlignerCap.decode(
                    example.data[
                        Identifier(StaticEmbeddingAlignerCap.__name__)(
                            threshold=MIN_SEMANTIC_SIM_THRESHOLD,
                            top_k=MAX_SEMANTIC_SIM_TOP_K,
                            columns=[
                                f'preprocessed_{document._.column}',
                                f'preprocessed_{summary._.column}',
                            ]
                        )
                    ])[0]
                for summary in summaries
            ]
        except KeyError:
            semantic_alignments = StaticEmbeddingAligner(
                semantic_sim_threshold,
                semantic_sim_top_k).align(
                document,
                summaries
            )
        else:
            semantic_alignments = [
                filter_alignment(alignment, semantic_sim_threshold, semantic_sim_top_k)
                for alignment in semantic_alignments
            ]
    else:
        try:
            semantic_alignments = [
                BertscoreAlignerCap.decode(
                    example.data[
                        Identifier(BertscoreAlignerCap.__name__)(
                            threshold=MIN_SEMANTIC_SIM_THRESHOLD,
                            top_k=MAX_SEMANTIC_SIM_TOP_K,
                            columns=[
                                f'preprocessed_{document._.column}',
                                f'preprocessed_{summary._.column}',
                            ]
                        )
                    ])[0]
                for summary in summaries
            ]
        except KeyError:
            semantic_alignments = BertscoreAligner(semantic_sim_threshold,
                                                   semantic_sim_top_k).align(document,
                                                                             summaries)
        else:
            semantic_alignments = [
                filter_alignment(alignment, semantic_sim_threshold, semantic_sim_top_k)
                for alignment in semantic_alignments
            ]

    MainView(
        document,
        summaries,
        semantic_alignments,
        lexical_alignments,
        layout,
        scroll,
        gray_out_stopwords,
    ).show(height=720)

if __name__ == "__main__":

    st.set_page_config(layout="wide")

    parser = argparse.ArgumentParser()
    parser.add_argument('--path', type=str, default='data')
    parser.add_argument('--file', type=str)
    args = parser.parse_args()

    nlp, is_lg = get_nlp()

    Spacy.encode = _spacy_encode
    Spacy.decode = _spacy_decode
    rg_spacy = Spacy(nlp=nlp)

    path = Path(args.path)
    all_files = set(map(os.path.basename, path.glob('*')))
    files = sorted([
        fname for fname in all_files if not (fname.endswith(".py") or fname.startswith("."))
    ])
    if args.file:
        try:
            file_index = files.index(args.file)
        except ValueError:
            raise FileNotFoundError(f"File not found: {args.file}")
    else:
        file_index = 0
    col1, col2 = st.beta_columns((3, 1))
    filename = col1.selectbox(label="File:", options=files, index=file_index)
    dataset = load_dataset(str(path / filename))

    dataset_size = len(dataset)
    query = col2.number_input(f"Index (Size: {dataset_size}):", value=0, min_value=0, max_value=dataset_size - 1)

    sidebar_placeholder_from = st.sidebar.empty()
    sidebar_placeholder_to = st.sidebar.empty()

    if query is not None:
        example = retrieve(dataset, query, filename)
        if example:
            show_main(example)
components.py
ADDED
@@ -0,0 +1,563 @@
from pathlib import Path
from collections import defaultdict
from itertools import count
from operator import itemgetter
from typing import Dict, Optional
from typing import List, Tuple, Union

import htbuilder
from htbuilder import span, script, style, link, div, styles, HtmlElement
from htbuilder.units import px
from spacy.tokens import Doc

import streamlit as st

palette = [
    "#66c2a5",
    "#fc8d62",
    "#8da0cb",
    "#e78ac3",
    "#a6d854",
    "#ffd92f",
    "#e5c494",
    "#b3b3b3",
]
inactive_color = "#BBB"


def local_stylesheet(path):
    with open(path) as f:
        css = f.read()
    return style()(
        css
    )


def remote_stylesheet(url):
    return link(
        href=url
    )


def local_script(path):
    with open(path) as f:
        code = f.read()
    return script()(
        code
    )


def remote_script(url):
    return script(
        src=url
    )


def get_color(sent_idx):
    return palette[sent_idx % len(palette)]


def hex_to_rgb(hex):
    hex = hex.replace("#", '')
    return tuple(int(hex[i:i + 2], 16) for i in (0, 2, 4))


def color_with_opacity(hex_color, opacity):
    rgb = hex_to_rgb(hex_color)
    return f"rgba({rgb[0]},{rgb[1]},{rgb[2]},{opacity:.2f})"

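As a quick sanity check of the two color helpers, using the first palette entry above:

assert hex_to_rgb("#66c2a5") == (102, 194, 165)
assert color_with_opacity("#66c2a5", 0.5) == "rgba(102,194,165,0.50)"
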
class Component:

    def show(self, width=None, height=None, scrolling=True, **kwargs):
        out = div(style=styles(
            **kwargs
        ))(self.html())
        html = str(out)
        st.components.v1.html(html, width=width, height=height, scrolling=scrolling)

    def html(self):
        raise NotImplementedError

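The contract is simply that subclasses return htbuilder elements from html(); a minimal, hypothetical subclass for illustration:

class HelloComponent(Component):
    def html(self):
        return div()("Hello")

# HelloComponent().show(height=50) would render the element inside the Streamlit app.
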
class MainView(Component):

    def __init__(
        self,
        document: Doc,
        summaries: List[Doc],
        semantic_alignments: Optional[List[Dict]],
        lexical_alignments: Optional[List[Dict]],
        layout: str,
        scroll: bool,
        gray_out_stopwords: bool
    ):
        self.document = document
        self.summaries = summaries
        self.semantic_alignments = semantic_alignments
        self.lexical_alignments = lexical_alignments
        self.layout = layout
        self.scroll = scroll
        self.gray_out_stopwords = gray_out_stopwords

    def html(self):

        # Add document elements
        if self.document._.name == 'Document':
            document_name = 'Source Document'
        else:
            document_name = self.document._.name + ' summary'
        doc_header = div(
            id_="document-header"
        )(
            document_name
        )
        doc_elements = []

        # Add document content, which comprises multiple elements, one for each summary.
        # Only the element corresponding to the selected summary will be visible.

        mu = MultiUnderline()

        for summary_idx, summary in enumerate(self.summaries):
            token_idx_to_sent_idx = {}
            for sent_idx, sent in enumerate(summary.sents):
                for token in sent:
                    token_idx_to_sent_idx[token.i] = sent_idx
            is_selected_summary = (summary_idx == 0)  # By default, first summary is selected

            if self.semantic_alignments is not None:
                doc_token_idx_to_matches = defaultdict(list)
                semantic_alignment = self.semantic_alignments[summary_idx]
                for summary_token_idx, matches in semantic_alignment.items():
                    for doc_token_idx, sim in matches:
                        doc_token_idx_to_matches[doc_token_idx].append((summary_token_idx, sim))
            else:
                doc_token_idx_to_matches = {}

            token_elements = []
            for doc_token_idx, doc_token in enumerate(self.document):
                if doc_token.is_stop or doc_token.is_punct:
                    classes = ["stopword"]
                    if self.gray_out_stopwords:
                        classes.append("grayed-out")
                    el = span(
                        _class=" ".join(classes)
                    )(
                        doc_token.text
                    )

                else:
                    matches = doc_token_idx_to_matches.get(doc_token_idx)
                    if matches:
                        summary_token_idx, sim = max(matches, key=itemgetter(1))
                        sent_idx = token_idx_to_sent_idx[summary_token_idx]
                        color_primary = get_color(sent_idx)
                        highlight_color_primary = color_with_opacity(color_primary, sim)
                        props = {
                            'data-highlight-id': str(doc_token_idx),
                            'data-primary-color': highlight_color_primary
                        }
                        match_classes = []
                        for summary_token_idx, sim in matches:
                            sent_idx = token_idx_to_sent_idx[summary_token_idx]
                            match_classes.append(f"summary-highlight-{summary_idx}-{summary_token_idx}")
                            color = color_with_opacity(get_color(sent_idx), sim)
                            props[f"data-color-{summary_idx}-{summary_token_idx}"] = color
                        props["data-match-classes"] = " ".join(match_classes)
                        el = self._highlight(
                            doc_token.text,
                            highlight_color_primary,
                            color_primary,
                            match_classes + ["annotation-hidden"],
                            **props
                        )
                    else:
                        el = doc_token.text
                token_elements.append(el)

            spans = []
            if self.lexical_alignments is not None:
                lexical_alignment = self.lexical_alignments[summary_idx]
                for summary_span, doc_spans in lexical_alignment.items():
                    summary_span_start, summary_span_end = summary_span
                    span_id = f"{summary_idx}-{summary_span_start}-{summary_span_end}"
                    sent_idx = token_idx_to_sent_idx[summary_span_start]
                    for doc_span_start, doc_span_end in doc_spans:
                        spans.append((
                            doc_span_start,
                            doc_span_end,
                            sent_idx,
                            get_color(sent_idx),
                            span_id
                        ))
            token_elements = mu.markup(token_elements, spans)

            classes = ["main-doc", "bordered"]
            if self.scroll:
                classes.append("scroll")

            main_doc = div(
                _class=" ".join(classes)
            )(
                token_elements
            ),

            classes = ["doc"]
            if is_selected_summary:
                classes.append("display")
            else:
                classes.append("nodisplay")
            doc_elements.append(
                div(
                    **{
                        "class": " ".join(classes),
                        "data-index": summary_idx
                    }
                )(
                    main_doc,
                    div(_class="proxy-doc"),
                    div(_class="proxy-scroll")
                )
            )

        summary_title = "Summary"
        summary_header = div(
            id_="summary-header"
        )(
            summary_title,
            div(id="summary-header-gap"),
        )

        summary_items = []
        for summary_idx, summary in enumerate(self.summaries):
            token_idx_to_sent_idx = {}
            for sent_idx, sent in enumerate(summary.sents):
                for token in sent:
                    token_idx_to_sent_idx[token.i] = sent_idx

            spans = []
            matches_ngram = [False] * len(list(summary))
            if self.lexical_alignments is not None:
                lexical_alignment = self.lexical_alignments[summary_idx]
                for summary_span in lexical_alignment.keys():
                    start, end = summary_span
                    matches_ngram[slice(start, end)] = [True] * (end - start)
                    span_id = f"{summary_idx}-{start}-{end}"
                    sent_idx = token_idx_to_sent_idx[start]
                    spans.append((
                        start,
                        end,
                        sent_idx,
                        get_color(sent_idx),
                        span_id
                    ))

            if self.semantic_alignments is not None:
                semantic_alignment = self.semantic_alignments[summary_idx]
            else:
                semantic_alignment = {}
            token_elements = []
            for token_idx, token in enumerate(summary):
                if token.is_stop or token.is_punct:
                    classes = ["stopword"]
                    if self.gray_out_stopwords:
                        classes.append("grayed-out")
                    el = span(
                        _class=" ".join(classes)
                    )(
                        token.text
                    )
                else:
                    classes = []
                    if token.ent_iob_ in ('I', 'B'):
                        classes.append("entity")
                    if matches_ngram[token_idx]:
                        classes.append("matches-ngram")
                    matches = semantic_alignment.get(token_idx)
                    if matches:
                        top_match = max(matches, key=itemgetter(1))
                        top_sim = max(top_match[1], 0)
                        top_doc_token_idx = top_match[0]
                        props = {
                            "data-highlight-id": f"{summary_idx}-{token_idx}",
                            "data-top-doc-highlight-id": str(top_doc_token_idx),
                            "data-top-doc-sim": f"{top_sim:.2f}",
                        }
                        classes.extend([
                            "annotation-hidden",
                            f"summary-highlight-{summary_idx}-{token_idx}"
                        ])
                        sent_idx = token_idx_to_sent_idx[token_idx]
                        el = self._highlight(
                            token.text,
                            color_with_opacity(get_color(sent_idx), top_sim),
                            color_with_opacity(get_color(sent_idx), 1),
                            classes,
                            **props
                        )
                    else:
                        if classes:
                            el = span(_class=" ".join(classes))(token.text)
                        else:
                            el = token.text
                token_elements.append(el)

            token_elements = mu.markup(token_elements, spans)

            classes = ["summary-item"]
            if summary_idx == 0:  # Default is for first summary to be selected
                classes.append("selected")

            summary_items.append(
                div(
                    **{"class": ' '.join(classes), "data-index": summary_idx}
                )(
                    div(_class="name")(summary._.name),
                    div(_class="content")(token_elements)
                )
            )
        classes = ["summary-list", "bordered"]
        if self.scroll:
            classes.append("scroll")
        if self.lexical_alignments is not None:
            classes.append("has-lexical-alignment")
        if self.semantic_alignments is not None:
            classes.append("has-semantic-alignment")
        summary_list = div(
            _class=" ".join(classes)
        )(
            summary_items
        )

        annotation_key = \
            """
            <ul class="annotation-key">
                <li class="annotation-key-label">Annotations:</li>
                <li id="option-lexical" class="option selected">
                    <span class="annotation-key-ngram">N-Gram overlap</span>
                </li>
                <li id="option-semantic" class="option selected">
                    <span class="annotation-key-semantic">Semantic overlap</span>
                </li>
                <li id="option-novel" class="option selected">
                    <span class="annotation-key-novel">Novel words</span>
                </li>
                <li id="option-entity" class="option selected">
                    <span class="annotation-key-entity">Novel entities</span>
                </li>
            </ul>
            """

        body = div(
            annotation_key,
            div(
                _class=f"vis-container {self.layout}-layout"
            )(
                div(
                    _class="doc-container"
                )(
                    doc_header,
                    *doc_elements
                ),
                div(
                    _class="summary-container"
                )(
                    summary_header,
                    summary_list
                )
            ),
        )
        return [
            """<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">""",
            local_stylesheet(Path(__file__).parent / "resources" / "summvis.css"),
            """<link rel="preconnect" href="https://fonts.gstatic.com">
            <link href="https://fonts.googleapis.com/css2?family=Roboto:wght@400;500&display=swap" rel="stylesheet">""",
            body,
            """<script
                src="https://code.jquery.com/jquery-3.5.1.min.js"
                integrity="sha256-9/aliU8dGd2tb6OSsuzixeV4y/faTqgFtohetphbbj0="
                crossorigin="anonymous"></script>
            <script src="https://cdn.jsdelivr.net/npm/bootstrap@4.6.0/dist/js/bootstrap.bundle.min.js"
                integrity="sha384-Piv4xVNRyMGpqkS2by6br4gNJ7DXjqk09RmUpJ8jgGtD7zP9yug3goQfGII0yAns"
                crossorigin="anonymous"></script>""",
            local_script(Path(__file__).parent / "resources" / "jquery.color-2.1.2.min.js"),
            local_script(Path(__file__).parent / "resources" / "summvis.js"),
            """<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/js/bootstrap.bundle.min.js" integrity="sha384-gtEjrD/SeCtmISkJkNUaaKMoLD0//ElJ19smozuHV6z3Iehds+3Ulb9Bn9Plx0x4" crossorigin="anonymous"></script>"""
        ]

    def _highlight(
        self,
        token: Union[str, HtmlElement],
        background_color,
        dotted_underline_color,
        classes: List[str],
        **props
    ):
        return span(
            _class=" ".join(classes + ["highlight"]),
            style=styles(
                background_color=background_color,
                border_bottom=f"4px dotted {dotted_underline_color}",
            ),
            **props
        )(token)

SPACE = " "


class MultiUnderline:
    def __init__(
        self,
        underline_thickness=3,
        underline_spacing=1
    ):
        self.underline_thickness = underline_thickness
        self.underline_spacing = underline_spacing

    def markup(
        self,
        tokens: List[Union[str, HtmlElement]],
        spans: List[Tuple[int, int, int, str, str]]
    ):
        """Style text with multiple layers of colored underlines.
        Args:
            tokens: list of tokens, either string or html element
            spans: list of (start_pos, end_pos, rank, color, id) tuples defined as:
                start_pos: start position of underline span
                end_pos: end position of underline span
                rank: rank for stacking order of underlines, all else being equal
                color: color of underline
                id: id of underline (encoded as a class label in resulting html element)
        Returns:
            List of HTML elements
        """

        # Map from span start position to span
        start_to_spans = defaultdict(list)
        for span in spans:
            start = span[0]
            start_to_spans[start].append(span)

        # Map from each underline slot position to list of active spans
        slot_to_spans = {}

        # Collection of html elements
        elements = []

        for pos, token in enumerate(tokens):
            # Remove spans that are no longer active (end < pos)
            slot_to_spans = defaultdict(
                list,
                {
                    slot: [span for span in spans if span[1] > pos]  # span[1] contains end of spans
                    for slot, spans in slot_to_spans.items() if spans
                }
            )

            # Add underlines to space between tokens for any continuing underlines
            if pos > 0:
                elements.append(self._get_underline_element(SPACE, slot_to_spans))

            # Find slot for any new spans
            new_spans = start_to_spans.pop(pos, None)
            if new_spans:
                new_spans.sort(
                    key=lambda span: (-(span[1] - span[0]), span[2]))  # Sort by span length (reversed), rank
                for new_span in new_spans:
                    # Find an existing slot or add a new one
                    for slot, spans in sorted(slot_to_spans.items(), key=itemgetter(0)):  # Sort by slot index
                        if spans:
                            # The first span in the slot strictly contains all other spans
                            containing_span = spans[0]
                            containing_start, containing_end = containing_span[0:2]
                            containing_color = containing_span[3]
                            start, end = new_span[0:2]
                            color = new_span[3]
                            # If the new span (1) is strictly contained in this span, or (2) exactly matches
                            # this span and is the same color, then add the span to this slot
                            if end <= containing_end and (
                                (start > containing_start or end < containing_end) or
                                (start == containing_start and end == containing_end and color == containing_color)
                            ):
                                spans.append(new_span)
                                break
                    else:
                        # Find a new slot index to add the span
                        for slot_index in count():
                            spans = slot_to_spans[slot_index]
                            if not spans:  # If slot is free, take it
                                spans.append(new_span)
                                break

            # Add underlines to token for all active spans
            elements.append(self._get_underline_element(token, slot_to_spans))
        return elements

    def _get_underline_element(self, token, slot_to_spans):
        if not slot_to_spans:
            return token
        max_slot_index = max(slot_to_spans.keys())
        element = token
        for slot_index in range(max_slot_index + 1):
            spans = slot_to_spans[slot_index]
            if not spans:
                color = "rgba(0, 0, 0, 0)"  # Transparent element w/opacity=0
                props = {}
            else:
                containing_slot = spans[0]
                color = containing_slot[3]
                classes = ["underline"]
                if token != SPACE:
                    classes.append("token-underline")
                classes.extend([f"span-{span[4]}" for span in spans])  # Encode ids in class names
                props = {
                    "class": " ".join(classes),
                    "data-primary-color": color
                }
            if slot_index == 0:
                padding_bottom = 0
            else:
                padding_bottom = self.underline_spacing
            display = "inline-block"
            element = htbuilder.span(
                style=styles(
                    display=display,
                    border_bottom=f"{self.underline_thickness}px solid",
                    border_color=color,
                    padding_bottom=px(padding_bottom),
                ),
                **props
            )(element)

        # Return outermost nested span
        return element


if __name__ == "__main__":
    from htbuilder import div

    # Test
    text = "The quick brown fox jumps"
    tokens = text.split()
    tokens = [
        "The",
        htbuilder.span(style=styles(color="red"))("quick"),
        "brown",
        "fox",
        "jumps"
    ]
    spans = [
        (0, 2, 0, "green", "green1"),
        (1, 3, 0, "orange", "orange1"),
        (3, 4, 0, "red", "red1"),
        (2, 4, 0, "blue", "blue1"),
        (1, 5, 0, "orange", "orange1"),
    ]

    mu = MultiUnderline()
    html = str(div(mu.markup(tokens, spans)))
    print(html)
data/10:cnn_dailymail_1000.validation/_dataset/data.gz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a67d16f8fade54f8e2d525ea80a5dd44af5b7070811e76ca0b1d281931505f8e
size 679738
data/10:cnn_dailymail_1000.validation/metadata.json
ADDED
@@ -0,0 +1 @@
{"interactions": "{\"cachedoperations\": \"{\\\"[\\\\\\\"{\\\\\\\\\\\\\\\"_name\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"Spacy\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"_index\\\\\\\\\\\\\\\": null, \\\\\\\\\\\\\\\"_parameters\\\\\\\\\\\\\\\": {\\\\\\\\\\\\\\\"lang\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"en\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"pipeline\\\\\\\\\\\\\\\": [\\\\\\\\\\\\\\\"tok2vec\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"tagger\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"sentencizer\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"parser\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"ner\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"attribute_ruler\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"lemmatizer\\\\\\\\\\\\\\\"]}}\\\\\\\", \\\\\\\"preprocessed_summary:bart-cnndm\\\\\\\"]\\\": 2, \\\"[\\\\\\\"{\\\\\\\\\\\\\\\"_name\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"Spacy\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"_index\\\\\\\\\\\\\\\": null, \\\\\\\\\\\\\\\"_parameters\\\\\\\\\\\\\\\": {\\\\\\\\\\\\\\\"lang\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"en\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"pipeline\\\\\\\\\\\\\\\": [\\\\\\\\\\\\\\\"tok2vec\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"tagger\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"sentencizer\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"parser\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"ner\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"attribute_ruler\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"lemmatizer\\\\\\\\\\\\\\\"]}}\\\\\\\", \\\\\\\"preprocessed_summary:bart-xsum\\\\\\\"]\\\": 3, \\\"[\\\\\\\"{\\\\\\\\\\\\\\\"_name\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"Spacy\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"_index\\\\\\\\\\\\\\\": null, \\\\\\\\\\\\\\\"_parameters\\\\\\\\\\\\\\\": {\\\\\\\\\\\\\\\"lang\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"en\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"pipeline\\\\\\\\\\\\\\\": [\\\\\\\\\\\\\\\"tok2vec\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"tagger\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"sentencizer\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"parser\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"ner\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"attribute_ruler\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"lemmatizer\\\\\\\\\\\\\\\"]}}\\\\\\\", \\\\\\\"preprocessed_summary:pegasus-cnndm\\\\\\\"]\\\": 4, \\\"[\\\\\\\"{\\\\\\\\\\\\\\\"_name\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"Spacy\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"_index\\\\\\\\\\\\\\\": null, \\\\\\\\\\\\\\\"_parameters\\\\\\\\\\\\\\\": {\\\\\\\\\\\\\\\"lang\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"en\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"pipeline\\\\\\\\\\\\\\\": [\\\\\\\\\\\\\\\"tok2vec\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"tagger\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"sentencizer\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"parser\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"ner\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"attribute_ruler\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"lemmatizer\\\\\\\\\\\\\\\"]}}\\\\\\\", \\\\\\\"preprocessed_summary:pegasus-multinews\\\\\\\"]\\\": 5, \\\"[\\\\\\\"{\\\\\\\\\\\\\\\"_name\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"Spacy\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"_index\\\\\\\\\\\\\\\": null, \\\\\\\\\\\\\\\"_parameters\\\\\\\\\\\\\\\": {\\\\\\\\\\\\\\\"lang\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"en\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"pipeline\\\\\\\\\\\\\\\": [\\\\\\\\\\\\\\\"tok2vec\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"tagger\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"sentencizer\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"parser\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"ner\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"attribute_ruler\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"lemmatizer\\\\\\\\\\\\\\\"]}}\\\\\\\", \\\\\\\"preprocessed_summary:pegasus-newsroom\\\\\\\"]\\\": 6, \\\"[\\\\\\\"{\\\\\\\\\\\\\\\"_name\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"Spacy\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"_index\\\\\\\\\\\\\\\": null, 
\\\\\\\\\\\\\\\"_parameters\\\\\\\\\\\\\\\": {\\\\\\\\\\\\\\\"lang\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"en\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"pipeline\\\\\\\\\\\\\\\": [\\\\\\\\\\\\\\\"tok2vec\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"tagger\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"sentencizer\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"parser\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"ner\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"attribute_ruler\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"lemmatizer\\\\\\\\\\\\\\\"]}}\\\\\\\", \\\\\\\"preprocessed_summary:pegasus-xsum\\\\\\\"]\\\": 7, \\\"[\\\\\\\"{\\\\\\\\\\\\\\\"_name\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"Spacy\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"_index\\\\\\\\\\\\\\\": null, \\\\\\\\\\\\\\\"_parameters\\\\\\\\\\\\\\\": {\\\\\\\\\\\\\\\"lang\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"en\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"pipeline\\\\\\\\\\\\\\\": [\\\\\\\\\\\\\\\"tok2vec\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"tagger\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"sentencizer\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"parser\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"ner\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"attribute_ruler\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"lemmatizer\\\\\\\\\\\\\\\"]}}\\\\\\\", \\\\\\\"preprocessed_document\\\\\\\"]\\\": 6, \\\"[\\\\\\\"{\\\\\\\\\\\\\\\"_name\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"Spacy\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"_index\\\\\\\\\\\\\\\": null, \\\\\\\\\\\\\\\"_parameters\\\\\\\\\\\\\\\": {\\\\\\\\\\\\\\\"lang\\\\\\\\\\\\\\\": \\\\\\\\\\\\\\\"en\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"pipeline\\\\\\\\\\\\\\\": [\\\\\\\\\\\\\\\"tok2vec\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"tagger\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"sentencizer\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"parser\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"ner\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"attribute_ruler\\\\\\\\\\\\\\\", \\\\\\\\\\\\\\\"lemmatizer\\\\\\\\\\\\\\\"]}}\\\\\\\", \\\\\\\"preprocessed_summary:reference\\\\\\\"]\\\": 7}\", \"slicebuilders\": {\"subpopulation\": \"{}\", \"transformation\": \"{}\", \"attack\": \"{}\"}}", "_identifier": "{\"_name\": \"RGDataset\", \"_index\": null, \"_parameters\": {\"jsonl\": \"preloading/cnn_dailymail_1000.validation.predictions.jsonl\"}}", "_dataset_fmt": "in_memory"}
generation.py
ADDED
@@ -0,0 +1,142 @@
#!/usr/bin/env python
# coding: utf-8
"""
Script for decoding summarization models available through Huggingface Transformers.

Usage with Huggingface Datasets:
    python generation.py --model <model name> --dataset <dataset name> --split <data split>

Usage with custom datasets in JSONL format:
    python generation.py --model <model name> --data_path <path to data in jsonl format>
"""

import argparse
import json
import os

import torch

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm

BATCH_SIZE = 8
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

BART_CNNDM_CHECKPOINT = 'facebook/bart-large-cnn'
BART_XSUM_CHECKPOINT = 'facebook/bart-large-xsum'
PEGASUS_CNNDM_CHECKPOINT = 'google/pegasus-cnn_dailymail'
PEGASUS_XSUM_CHECKPOINT = 'google/pegasus-xsum'
PEGASUS_NEWSROOM_CHECKPOINT = 'google/pegasus-newsroom'
PEGASUS_MULTINEWS_CHECKPOINT = 'google/pegasus-multi_news'

MODEL_CHECKPOINTS = {
    'bart-xsum': BART_XSUM_CHECKPOINT,
    'bart-cnndm': BART_CNNDM_CHECKPOINT,
    'pegasus-xsum': PEGASUS_XSUM_CHECKPOINT,
    'pegasus-cnndm': PEGASUS_CNNDM_CHECKPOINT,
    'pegasus-newsroom': PEGASUS_NEWSROOM_CHECKPOINT,
    'pegasus-multinews': PEGASUS_MULTINEWS_CHECKPOINT
}


class JSONDataset(torch.utils.data.Dataset):
    def __init__(self, data_path):
        super(JSONDataset, self).__init__()

        with open(data_path) as fd:
            self.data = [json.loads(line) for line in fd]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]


def preprocess_data(raw_data, dataset):
    """
    Unify the format of Huggingface Datasets.

    :param raw_data: loaded data
    :param dataset: name of dataset
    """
    if dataset == 'xsum':
        raw_data['article'] = raw_data['document']
        raw_data['target'] = raw_data['summary']
        del raw_data['document']
        del raw_data['summary']
    elif dataset == 'cnndm':
        raw_data['target'] = raw_data['highlights']
        del raw_data['highlights']
    elif dataset == 'gigaword':
        raw_data['article'] = raw_data['document']
        raw_data['target'] = raw_data['summary']
        del raw_data['document']
        del raw_data['summary']

    return raw_data


def postprocess_data(raw_data, decoded):
    """
    Remove generation artifacts and postprocess outputs.

    :param raw_data: loaded data
    :param decoded: model outputs
    """
    raw_data['target'] = [x.replace('\n', ' ') for x in raw_data['target']]
    raw_data['decoded'] = [x.replace('<n>', ' ') for x in decoded]

    return [dict(zip(raw_data, t)) for t in zip(*raw_data.values())]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Generate summaries with pretrained summarization models.')
    parser.add_argument('--model', type=str, required=True, choices=['bart-xsum', 'bart-cnndm', 'pegasus-xsum', 'pegasus-cnndm', 'pegasus-newsroom', 'pegasus-multinews'])
    parser.add_argument('--data_path', type=str)
    parser.add_argument('--dataset', type=str, choices=['xsum', 'cnndm', 'gigaword'])
    parser.add_argument('--split', type=str, choices=['train', 'validation', 'test'])
    args = parser.parse_args()

    if args.dataset and not args.split:
        raise RuntimeError('If the `dataset` flag is specified, `split` must also be provided.')

    if args.data_path:
        args.dataset = os.path.splitext(os.path.basename(args.data_path))[0]
        args.split = 'user'

    # Load models & data
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINTS[args.model]).to(DEVICE)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINTS[args.model])

    if not args.data_path:
        if args.dataset == 'cnndm':
            dataset = load_dataset('cnn_dailymail', '3.0.0', split=args.split)
        elif args.dataset == 'xsum':
            dataset = load_dataset('xsum', split=args.split)
        elif args.dataset == 'gigaword':
            dataset = load_dataset('gigaword', split=args.split)
    else:
        dataset = JSONDataset(args.data_path)

    dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE)

    # Run generation
    filename = '%s.%s.%s.results' % (args.model.replace("/", "-"), args.dataset, args.split)
    fd_out = open(filename, 'w')

    results = []
    model.eval()
    with torch.no_grad():
        for raw_data in tqdm(dataloader):
            raw_data = preprocess_data(raw_data, args.dataset)
            batch = tokenizer(raw_data["article"], return_tensors="pt", truncation=True, padding="longest").to(DEVICE)
            summaries = model.generate(input_ids=batch.input_ids, attention_mask=batch.attention_mask)

            decoded = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            result = postprocess_data(raw_data, decoded)
            results.extend(result)

            for example in result:
                fd_out.write(json.dumps(example) + '\n')

    fd_out.close()
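The loop above emits one JSON object per line; a minimal sketch for reading the results back (the filename shown assumes --model bart-cnndm --dataset cnndm --split validation):

import json

with open('bart-cnndm.cnndm.validation.results') as f:
    rows = [json.loads(line) for line in f]
# Each row carries at least 'target' (the reference) and 'decoded' (the model output).
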
preprocessing.py
ADDED
@@ -0,0 +1,701 @@
import logging
import os
from argparse import ArgumentParser
from ast import literal_eval
from types import SimpleNamespace
from typing import List

from robustnessgym import Dataset, Spacy, CachedOperation
from robustnessgym.core.constants import CACHEDOPS
from robustnessgym.core.tools import strings_as_json
from robustnessgym.logging.utils import set_logging_level
from spacy import load
from spacy.attrs import DEP, IS_ALPHA, IS_PUNCT, IS_STOP, LEMMA, LOWER, TAG, SENT_END, \
    SENT_START, ORTH, POS, ENT_IOB
from spacy.tokens import Doc

from align import BertscoreAligner, NGramAligner, StaticEmbeddingAligner
from utils import preprocess_text

set_logging_level('critical')
logger = logging.getLogger(__name__)
logger.setLevel(logging.CRITICAL)


def _spacy_encode(self, x):
    arr = x.to_array(
        [DEP, IS_ALPHA, IS_PUNCT, IS_STOP, LEMMA, LOWER, TAG, SENT_END, SENT_START,
         ORTH, POS, ENT_IOB])
    return {
        'arr': arr.flatten(),
        'shape': list(arr.shape),
        'words': [t.text for t in x]
    }


def _spacy_decode(self, x):
    doc = Doc(self.nlp.vocab, words=x['words'])
    return doc.from_array(
        [DEP, IS_ALPHA, IS_PUNCT, IS_STOP, LEMMA, LOWER,
         TAG, SENT_END, SENT_START, ORTH, POS, ENT_IOB],
        x['arr'].reshape(x['shape'])
    )


Spacy.encode = _spacy_encode
Spacy.decode = _spacy_decode

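A round-trip sketch for the codec above, on hypothetical text; it assumes, as _spacy_decode already does, that the Robustness Gym Spacy wrapper exposes its pipeline as self.nlp, and that en_core_web_sm is installed:

nlp_demo = load('en_core_web_sm')  # hypothetical demo pipeline
spacy_demo = Spacy(nlp=nlp_demo)
doc = nlp_demo("A short test sentence.")
restored = spacy_demo.decode(spacy_demo.encode(doc))
assert [t.text for t in restored] == [t.text for t in doc]
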
class AlignerCap(CachedOperation):
    def __init__(
        self,
        aligner,
        spacy,
        **kwargs,
    ):
        super(AlignerCap, self).__init__(**kwargs)
        self.spacy = spacy
        self.aligner = aligner

    @classmethod
    def encode(cls, x):
        # Convert to built-in types from np.int / np.float
        return super(AlignerCap, cls).encode([
            {str(k): [(int(t[0]), float(t[1])) for t in v] for k, v in d.items()}
            for d in x
        ])

    @classmethod
    def decode(cls, x):
        x = super(AlignerCap, cls).decode(x)
        x = [{literal_eval(k): v for k, v in d.items()} for d in x]
        return x

    def apply(self, batch, columns, *args, **kwargs):
        # Run the aligner on the first example of the batch
        return [
            self.aligner.align(
                self.spacy.retrieve(batch, columns[0])[0],
                [self.spacy.retrieve(batch, col)[0] for col in columns[1:]]
                if len(columns) > 2 else
                [self.spacy.retrieve(batch, columns[1])[0]],
            )
        ]

class BertscoreAlignerCap(AlignerCap):
    def __init__(
        self,
        threshold: float,
        top_k: int,
        spacy,
    ):
        super(BertscoreAlignerCap, self).__init__(
            aligner=BertscoreAligner(threshold=threshold, top_k=top_k),
            spacy=spacy,
            threshold=threshold,
            top_k=top_k,
        )


class NGramAlignerCap(AlignerCap):
    def __init__(
        self,
        spacy,
    ):
        super(NGramAlignerCap, self).__init__(
            aligner=NGramAligner(),
            spacy=spacy
        )


class StaticEmbeddingAlignerCap(AlignerCap):
    def __init__(
        self,
        threshold: float,
        top_k: int,
        spacy,
    ):
        super(StaticEmbeddingAlignerCap, self).__init__(
            aligner=StaticEmbeddingAligner(threshold=threshold, top_k=top_k),
            spacy=spacy,
            threshold=threshold,
            top_k=top_k,
        )

def _run_aligners(
    dataset: Dataset,
    aligners: List[CachedOperation],
    doc_column: str,
    reference_column: str,
    summary_columns: List[str] = None,
):
    if not summary_columns:
        summary_columns = []

    to_columns = []
    if reference_column is not None:
        to_columns.append(reference_column)
    to_columns.extend(summary_columns)

    for aligner in aligners:

        # Run the aligner on (document, summary) pairs
        dataset = aligner(
            dataset,
            [doc_column] + to_columns,
            # Must use `batch_size = 1`
            batch_size=1,
        )

        if reference_column is not None and len(summary_columns):
            # Run the aligner on (reference, summary) pairs
            dataset = aligner(
                dataset,
                [reference_column] + summary_columns,
                # Must use `batch_size = 1`
                batch_size=1,
            )

        if len(to_columns) > 1:
            # Instead of having one column for (document, summary) comparisons, split
            # off into (1 + |summary_columns|) total columns, one for each comparison

            # Retrieve the (document, summary) column
            doc_summary_column = aligner.retrieve(
                dataset[:],
                [doc_column] + to_columns,
            )[tuple([doc_column] + to_columns)]

            for i, col in enumerate(to_columns):
                # Add as a new column after encoding with the aligner's `encode` method
                dataset.add_column(
                    column=str(aligner.identifier(columns=[doc_column, col])),
                    values=[aligner.encode([row[i]]) for row in doc_summary_column],
                )

            # Remove the (document, summary) column
            dataset.remove_column(
                str(
                    aligner.identifier(
                        columns=[doc_column] + to_columns
                    )
                )
            )
            del dataset.interactions[CACHEDOPS].history[
                (
                    aligner.identifier,
                    strings_as_json(
                        strings=[doc_column] + to_columns
                    )
                )
            ]

        if reference_column is not None and len(summary_columns) > 1:
            # Instead of having one column for (reference, summary) comparisons, split
            # off into (|summary_columns|) total columns, one for each comparison

            # Retrieve the (reference, summary) column
            reference_summary_column = aligner.retrieve(
                dataset[:],
                [reference_column] + summary_columns,
            )[tuple([reference_column] + summary_columns)]

            for i, col in enumerate(summary_columns):
                # Add as a new column
                dataset.add_column(
                    column=str(aligner.identifier(columns=[reference_column, col])),
                    values=[
                        aligner.encode([row[i]]) for row in reference_summary_column
                    ]
                )

            # Remove the (reference, summary) column
            dataset.remove_column(
                str(
                    aligner.identifier(
                        columns=[reference_column] + summary_columns
                    )
                )
            )
            del dataset.interactions[CACHEDOPS].history[
                (
                    aligner.identifier,
                    strings_as_json(
                        strings=[reference_column] + summary_columns
                    )
                )
            ]

    return dataset

def deanonymize_dataset(
    rg_path: str,
    standardized_dataset: Dataset,
    processed_dataset_path: str = None,
    n_samples: int = None,
):
    """Take an anonymized dataset and add back the original dataset columns."""
    assert processed_dataset_path is not None, \
        "Please specify a path to save the dataset."

    # Load the dataset
    dataset = Dataset.load_from_disk(rg_path)

    if n_samples:
        dataset.set_visible_rows(list(range(n_samples)))
        standardized_dataset.set_visible_rows(list(range(n_samples)))

    text_columns = []

    # Add columns from the standardized dataset
    dataset.add_column('document', standardized_dataset['document'])
    text_columns.append('document')

    if 'summary:reference' in standardized_dataset.column_names:
        dataset.add_column('summary:reference', standardized_dataset['summary:reference'])
        text_columns.append('summary:reference')

    # Preprocess all the text columns
    dataset = dataset.update(
        lambda x: {f'preprocessed_{k}': preprocess_text(x[k]) for k in text_columns}
    )

    # Run the Spacy pipeline on all preprocessed text columns
    try:
        nlp = load('en_core_web_lg')
    except OSError:
        nlp = load('en_core_web_sm')

    nlp.add_pipe('sentencizer', before="parser")
    spacy = Spacy(nlp=nlp)
    dataset = spacy(
        dataset,
        [f'preprocessed_{col}' for col in text_columns],
        batch_size=100,
    )

    # Directly save to disk
    dataset.save_to_disk(processed_dataset_path)

    return dataset

def run_workflow(
|
289 |
+
jsonl_path: str = None,
|
290 |
+
dataset: Dataset = None,
|
291 |
+
doc_column: str = None,
|
292 |
+
reference_column: str = None,
|
293 |
+
summary_columns: List[str] = None,
|
294 |
+
bert_aligner_threshold: float = 0.5,
|
295 |
+
bert_aligner_top_k: int = 3,
|
296 |
+
embedding_aligner_threshold: float = 0.5,
|
297 |
+
embedding_aligner_top_k: int = 3,
|
298 |
+
processed_dataset_path: str = None,
|
299 |
+
n_samples: int = None,
|
300 |
+
anonymize: bool = False,
|
301 |
+
):
|
302 |
+
    assert (jsonl_path is None) != (dataset is None), \
        "One of `jsonl_path` and `dataset` must be specified."
    assert processed_dataset_path is not None, \
        "Please specify a path to save the dataset."

    # Load the dataset
    if jsonl_path is not None:
        dataset = Dataset.from_jsonl(jsonl_path)

    if doc_column is None:
        # Assume `doc_column` is called "document"
        doc_column = 'document'
        assert doc_column in dataset.column_names, \
            f"`doc_column={doc_column}` is not a column in dataset."
        print("Assuming `doc_column` is called 'document'.")

    if reference_column is None:
        # Assume `reference_column` is called "summary:reference"
        reference_column = 'summary:reference'
        print("Assuming `reference_column` is called 'summary:reference'.")
    if reference_column not in dataset.column_names:
        print("No reference summary loaded")
        reference_column = None

    if summary_columns is None or len(summary_columns) == 0:
        # Assume `summary_columns` are prefixed by "summary:"
        summary_columns = []
        for col in dataset.column_names:
            if col.startswith("summary:") and col != "summary:reference":
                summary_columns.append(col)
        print(f"Reading summary columns from dataset. Found {summary_columns}.")

    if len(summary_columns) == 0 and reference_column is None:
        raise ValueError("At least one summary is required")

    # Set visible rows to restrict to the first `n_samples`
    if n_samples:
        dataset.set_visible_rows(list(range(n_samples)))

    # Combine the text columns into one list
    text_columns = [doc_column] + ([reference_column] if reference_column else []) + summary_columns

    # Preprocess all the text columns
    dataset = dataset.update(
        lambda x: {f'preprocessed_{k}': preprocess_text(x[k]) for k in text_columns}
    )

    # Run the spaCy pipeline on all preprocessed text columns
    nlp = load('en_core_web_lg')
    nlp.add_pipe('sentencizer', before="parser")
    spacy = Spacy(nlp=nlp)
    dataset = spacy(
        dataset,
        [f'preprocessed_{col}' for col in text_columns],
        batch_size=100,
    )

    # Run the 3 alignment pipelines
    bert_aligner = BertscoreAlignerCap(
        threshold=bert_aligner_threshold,
        top_k=bert_aligner_top_k,
        spacy=spacy,
    )

    embedding_aligner = StaticEmbeddingAlignerCap(
        threshold=embedding_aligner_threshold,
        top_k=embedding_aligner_top_k,
        spacy=spacy,
    )

    ngram_aligner = NGramAlignerCap(
        spacy=spacy,
    )

    dataset = _run_aligners(
        dataset=dataset,
        aligners=[bert_aligner, embedding_aligner, ngram_aligner],
        doc_column=f'preprocessed_{doc_column}',
        reference_column=f'preprocessed_{reference_column}' if reference_column else None,
        summary_columns=[f'preprocessed_{col}' for col in summary_columns],
    )

    # Save the dataset
    if anonymize:
        # Remove certain columns to anonymize and save to disk
        for col in [doc_column, reference_column]:
            if col is not None:
                dataset.remove_column(col)
                dataset.remove_column(f'preprocessed_{col}')
                dataset.remove_column(
                    str(spacy.identifier(columns=[f'preprocessed_{col}']))
                )
                del dataset.interactions[CACHEDOPS].history[
                    (spacy.identifier, f'preprocessed_{col}')
                ]
        dataset.save_to_disk(f'{processed_dataset_path}.anonymized')
    else:
        # Directly save to disk
        dataset.save_to_disk(processed_dataset_path)

    return dataset

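# A minimal usage sketch of `run_workflow`, assuming a standardized jsonl
# produced by the helpers below; both paths are hypothetical placeholders:
#
#   run_workflow(
#       jsonl_path='preprocessing/standardized_cnn_dailymail_3.0.0_test.jsonl',
#       processed_dataset_path='data/my_processed_dataset',
#       n_samples=10,
#   )
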
def parse_prediction_jsonl_name(prediction_jsonl: str):
    """Parse the name of the prediction_jsonl to extract useful information."""
    # Analyze the name of the prediction_jsonl
    filename = prediction_jsonl.split("/")[-1]

    # Check that the filename ends with `.results.anonymized`
    if filename.endswith(".results.anonymized"):
        # Fmt: <model>-<training dataset>.<eval dataset>.<eval split>.results.anonymized

        # Split on periods
        model_train_dataset, eval_dataset, eval_split = filename.split(".")[:-2]
        model, train_dataset = model_train_dataset.split("-")

        return SimpleNamespace(
            model_train_dataset=model_train_dataset,
            model=model,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            eval_split=eval_split,
        )

    raise NotImplementedError(
        "Prediction files must be named "
        "<model>-<training dataset>.<eval dataset>.<eval split>.results.anonymized. "
        f"Please rename the prediction file {filename} and run again."
    )

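# A doctest-style sketch of the expected filename format (the model and
# dataset names here are made up for illustration):
#
#   >>> info = parse_prediction_jsonl_name(
#   ...     "bart-xsum.cnn_dailymail.test.results.anonymized")
#   >>> (info.model, info.train_dataset, info.eval_dataset, info.eval_split)
#   ('bart', 'xsum', 'cnn_dailymail', 'test')
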
def join_predictions(
        dataset_jsonl: str = None,
        prediction_jsonls: str = None,
        save_jsonl_path: str = None,
):
    """Join predictions with a dataset."""
    assert prediction_jsonls is not None, "Must have prediction jsonl files."

    print(
        "> Warning: please inspect the prediction .jsonl file to make sure that "
        "predictions are aligned with the examples in the dataset. "
        "Use `get_dataset` to inspect the dataset."
    )

    # Load the dataset
    dataset = get_dataset(dataset_jsonl=dataset_jsonl)

    # Parse names of all prediction files to get metadata
    metadata = [
        parse_prediction_jsonl_name(prediction_jsonl)
        for prediction_jsonl in prediction_jsonls
    ]

    # Load the predictions
    predictions = [
        Dataset.from_jsonl(json_path=prediction_jsonl)
        for prediction_jsonl in prediction_jsonls
    ]

    # Predictions for a model
    for i, prediction_data in enumerate(predictions):
        # Get metadata for the i-th prediction file
        metadata_i = metadata[i]

        # Construct a prefix for columns added to the dataset for this prediction file
        prefix = metadata_i.model_train_dataset

        # Add the prediction columns to the dataset
        for col in prediction_data.column_names:
            # Don't add the indexing information since the dataset has it already
            if col not in {'index', 'ix', 'id'}:
                # `add_column` will automatically ensure that column lengths match
                if col == 'decoded':  # rename decoded to summary
                    dataset.add_column(f'summary:{prefix}', prediction_data[col])
                else:
                    dataset.add_column(f'{prefix}:{col}', prediction_data[col])

    # Save the dataset back to disk
    if save_jsonl_path:
        dataset.to_jsonl(save_jsonl_path)
    else:
        print("Dataset with predictions was not saved since `save_jsonl_path` "
              "was not specified.")

    return dataset

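# A usage sketch with hypothetical file names: given the naming scheme parsed
# above, the `decoded` column of each prediction file is added as
# `summary:<model>-<training dataset>`, and any other column `col` is added
# as `<model>-<training dataset>:col`:
#
#   dataset = join_predictions(
#       dataset_jsonl='standardized_cnn_dailymail_3.0.0_test.jsonl',
#       prediction_jsonls=['bart-xsum.cnn_dailymail.test.results.anonymized'],
#       save_jsonl_path='cnn_dailymail_with_predictions.jsonl',
#   )  # adds a 'summary:bart-xsum' column
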
def standardize_dataset(
        dataset_name: str = None,
        dataset_version: str = None,
        dataset_split: str = 'test',
        dataset_jsonl: str = None,
        doc_column: str = None,
        reference_column: str = None,
        save_jsonl_path: str = None,
        no_save: bool = False,
):
    """Load a dataset from Huggingface and dump it to disk."""
    # Load the dataset from Huggingface
    dataset = get_dataset(
        dataset_name=dataset_name,
        dataset_version=dataset_version,
        dataset_split=dataset_split,
        dataset_jsonl=dataset_jsonl,
    )

    if doc_column is None:
        if reference_column is not None:
            raise ValueError("You must specify `doc_column` if you specify `reference_column`")
        try:
            doc_column, reference_column = {
                'cnn_dailymail': ('article', 'highlights'),
                'xsum': ('document', 'summary')
            }[dataset_name]
        except KeyError:
            raise NotImplementedError(
                "Please specify `doc_column`."
            )

    # Rename the columns
    if doc_column != 'document':
        dataset.add_column('document', dataset[doc_column])
        dataset.remove_column(doc_column)
    dataset.add_column('summary:reference', dataset[reference_column])
    dataset.remove_column(reference_column)

    # Save the dataset back to disk
    if save_jsonl_path:
        dataset.to_jsonl(save_jsonl_path)

    elif (save_jsonl_path is None) and not no_save:
        # Auto-create a path to save the standardized dataset
        os.makedirs('preprocessing', exist_ok=True)
        if not dataset_jsonl:
            dataset.to_jsonl(
                f'preprocessing/'
                f'standardized_{dataset_name}_{dataset_version}_{dataset_split}.jsonl'
            )
        else:
            dataset.to_jsonl(
                f'preprocessing/'
                f'standardized_{dataset_jsonl.split("/")[-1]}'
            )

    return dataset

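# A usage sketch: for the two datasets with known column mappings, only the
# dataset coordinates are needed, and the auto-naming branch above would write
# the path shown in the comment:
#
#   standardize_dataset(
#       dataset_name='cnn_dailymail',
#       dataset_version='3.0.0',
#       dataset_split='test',
#   )
#   # -> preprocessing/standardized_cnn_dailymail_3.0.0_test.jsonl
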
def get_dataset(
        dataset_name: str = None,
        dataset_version: str = None,
        dataset_split: str = 'test',
        dataset_jsonl: str = None,
):
    """Load a dataset."""
    assert (dataset_name is not None) != (dataset_jsonl is not None), \
        "Specify one of `dataset_name` or `dataset_jsonl`."

    # Load the dataset
    if dataset_name is not None:
        return get_hf_dataset(dataset_name, dataset_version, dataset_split)

    return Dataset.from_jsonl(json_path=dataset_jsonl)


def get_hf_dataset(name: str, version: str = None, split: str = 'test'):
    """Get dataset from Huggingface."""
    if version:
        return Dataset.load_dataset(name, version, split=split)
    return Dataset.load_dataset(name, split=split)

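# A sketch of the two supported loading modes (the jsonl path is a
# hypothetical placeholder):
#
#   dataset = get_dataset(dataset_name='xsum', dataset_split='test')
#   dataset = get_dataset(dataset_jsonl='my_dataset.jsonl')
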
if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--dataset', type=str, choices=['cnn_dailymail', 'xsum'],
                        help="Huggingface dataset name.")
    parser.add_argument('--version', type=str,
                        help="Huggingface dataset version.")
    parser.add_argument('--split', type=str, default='test',
                        help="Huggingface dataset split.")
    parser.add_argument('--dataset_jsonl', type=str,
                        help="Path to a jsonl file for the dataset.")
    parser.add_argument('--dataset_rg', type=str,
                        help="Path to a dataset stored in the Robustness Gym format. "
                             "All processed datasets are stored in this format.")
    parser.add_argument('--prediction_jsonls', nargs='+', default=[],
                        help="Path to one or more jsonl files for the predictions.")
    parser.add_argument('--save_jsonl_path', type=str,
                        help="Path to save the processed jsonl dataset.")

    parser.add_argument('--doc_column', type=str,
                        help="Name of the document column in the dataset.")
    parser.add_argument('--reference_column', type=str,
                        help="Name of the reference summary column in the dataset.")
    parser.add_argument('--summary_columns', nargs='+', default=[],
                        help="Name of other summary columns in/added to the dataset.")

    parser.add_argument('--bert_aligner_threshold', type=float, default=0.1,
                        help="Minimum threshold for BERT alignment.")
    parser.add_argument('--bert_aligner_top_k', type=int, default=10,
                        help="Top-k for BERT alignment.")
    parser.add_argument('--embedding_aligner_threshold', type=float, default=0.1,
                        help="Minimum threshold for embedding alignment.")
    parser.add_argument('--embedding_aligner_top_k', type=int, default=10,
                        help="Top-k for embedding alignment.")
    parser.add_argument('--processed_dataset_path', type=str,
                        help="Path to store the final processed dataset.")
    parser.add_argument('--n_samples', type=int,
                        help="Number of dataset samples to process.")

    parser.add_argument('--workflow', action='store_true', default=False,
                        help="Whether to run the preprocessing workflow.")
    parser.add_argument('--standardize', action='store_true', default=False,
                        help="Whether to standardize the dataset and save to jsonl.")
    parser.add_argument('--join_predictions', action='store_true', default=False,
                        help="Whether to add predictions to the dataset and save to jsonl.")
    parser.add_argument('--try_it', action='store_true', default=False,
                        help="`Try it` mode is faster and runs processing on 10 examples.")
    parser.add_argument('--deanonymize', action='store_true', default=False,
                        help="Deanonymize the dataset provided by summvis.")
    parser.add_argument('--anonymize', action='store_true', default=False,
                        help="Anonymize by removing document and reference summary "
                             "columns of the original dataset.")

    args = parser.parse_args()

    if args.standardize:
        # Dump a dataset to jsonl on disk after standardizing it
        standardize_dataset(
            dataset_name=args.dataset,
            dataset_version=args.version,
            dataset_split=args.split,
            dataset_jsonl=args.dataset_jsonl,
            doc_column=args.doc_column,
            reference_column=args.reference_column,
            save_jsonl_path=args.save_jsonl_path,
        )

    if args.join_predictions:
        # Join the predictions with the dataset
        dataset = join_predictions(
            dataset_jsonl=args.dataset_jsonl,
            prediction_jsonls=args.prediction_jsonls,
            save_jsonl_path=args.save_jsonl_path,
        )

    if args.workflow:
        # Run the processing workflow
        dataset = None
        # Check if `args.dataset_rg` was passed in
        if args.dataset_rg:
            # Load the dataset directly
            dataset = Dataset.load_from_disk(args.dataset_rg)

        run_workflow(
            jsonl_path=args.dataset_jsonl,
            dataset=dataset,
            doc_column=args.doc_column,
            reference_column=args.reference_column,
            summary_columns=args.summary_columns,
            bert_aligner_threshold=args.bert_aligner_threshold,
            bert_aligner_top_k=args.bert_aligner_top_k,
            embedding_aligner_threshold=args.embedding_aligner_threshold,
            embedding_aligner_top_k=args.embedding_aligner_top_k,
            processed_dataset_path=args.processed_dataset_path,
            n_samples=args.n_samples if not args.try_it else 10,
            anonymize=args.anonymize,
        )

    if args.deanonymize:
        # Deanonymize an anonymized dataset
        # Check if `args.dataset_rg` was passed in
        assert args.dataset_rg is not None, \
            "Must specify `dataset_rg` path to be deanonymized."
        assert args.dataset_rg.endswith('anonymized'), \
            "`dataset_rg` must end in 'anonymized'."
        assert (args.dataset is None) != (args.dataset_jsonl is None), \
            "`dataset_rg` points to an anonymized dataset that will be " \
            "deanonymized. Please pass in relevant arguments: either " \
            "`dataset`, `version` and `split` OR `dataset_jsonl`."

        # Load the standardized dataset
        standardized_dataset = standardize_dataset(
            dataset_name=args.dataset,
            dataset_version=args.version,
            dataset_split=args.split,
            dataset_jsonl=args.dataset_jsonl,
            doc_column=args.doc_column,
            reference_column=args.reference_column,
            no_save=True,
        )
        # Use it to deanonymize
        dataset = deanonymize_dataset(
            rg_path=args.dataset_rg,
            standardized_dataset=standardized_dataset,
            processed_dataset_path=args.processed_dataset_path,
            n_samples=args.n_samples if not args.try_it else 10,
        )
requirements.txt
ADDED
@@ -0,0 +1,11 @@
spacy==3.0.3
streamlit==0.77.0
st-annotated-text==1.1.0
transformers==4.2.2
torch==1.7.1
bert-score==0.3.7
rouge-score==0.0.4
toolz==0.11.1
nltk==3.4.5
robustnessgym==0.0.4a0
sentencepiece==0.1.95
resources/jquery.color-2.1.2.min.js
ADDED
@@ -0,0 +1,2 @@
/*! jQuery Color v@2.1.2 http://github.com/jquery/jquery-color | jquery.org/license */
(function(a,b){function m(a,b,c){var d=h[b.type]||{};return a==null?c||!b.def?null:b.def:(a=d.floor?~~a:parseFloat(a),isNaN(a)?b.def:d.mod?(a+d.mod)%d.mod:0>a?0:d.max<a?d.max:a)}function n(b){var c=f(),d=c._rgba=[];return b=b.toLowerCase(),l(e,function(a,e){var f,h=e.re.exec(b),i=h&&e.parse(h),j=e.space||"rgba";if(i)return f=c[j](i),c[g[j].cache]=f[g[j].cache],d=c._rgba=f._rgba,!1}),d.length?(d.join()==="0,0,0,0"&&a.extend(d,k.transparent),c):k[b]}function o(a,b,c){return c=(c+1)%1,c*6<1?a+(b-a)*c*6:c*2<1?b:c*3<2?a+(b-a)*(2/3-c)*6:a}var c="backgroundColor borderBottomColor borderLeftColor borderRightColor borderTopColor color columnRuleColor outlineColor textDecorationColor textEmphasisColor",d=/^([\-+])=\s*(\d+\.?\d*)/,e=[{re:/rgba?\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*(?:,\s*(\d?(?:\.\d+)?)\s*)?\)/,parse:function(a){return[a[1],a[2],a[3],a[4]]}},{re:/rgba?\(\s*(\d+(?:\.\d+)?)\%\s*,\s*(\d+(?:\.\d+)?)\%\s*,\s*(\d+(?:\.\d+)?)\%\s*(?:,\s*(\d?(?:\.\d+)?)\s*)?\)/,parse:function(a){return[a[1]*2.55,a[2]*2.55,a[3]*2.55,a[4]]}},{re:/#([a-f0-9]{2})([a-f0-9]{2})([a-f0-9]{2})/,parse:function(a){return[parseInt(a[1],16),parseInt(a[2],16),parseInt(a[3],16)]}},{re:/#([a-f0-9])([a-f0-9])([a-f0-9])/,parse:function(a){return[parseInt(a[1]+a[1],16),parseInt(a[2]+a[2],16),parseInt(a[3]+a[3],16)]}},{re:/hsla?\(\s*(\d+(?:\.\d+)?)\s*,\s*(\d+(?:\.\d+)?)\%\s*,\s*(\d+(?:\.\d+)?)\%\s*(?:,\s*(\d?(?:\.\d+)?)\s*)?\)/,space:"hsla",parse:function(a){return[a[1],a[2]/100,a[3]/100,a[4]]}}],f=a.Color=function(b,c,d,e){return new a.Color.fn.parse(b,c,d,e)},g={rgba:{props:{red:{idx:0,type:"byte"},green:{idx:1,type:"byte"},blue:{idx:2,type:"byte"}}},hsla:{props:{hue:{idx:0,type:"degrees"},saturation:{idx:1,type:"percent"},lightness:{idx:2,type:"percent"}}}},h={"byte":{floor:!0,max:255},percent:{max:1},degrees:{mod:360,floor:!0}},i=f.support={},j=a("<p>")[0],k,l=a.each;j.style.cssText="background-color:rgba(1,1,1,.5)",i.rgba=j.style.backgroundColor.indexOf("rgba")>-1,l(g,function(a,b){b.cache="_"+a,b.props.alpha={idx:3,type:"percent",def:1}}),f.fn=a.extend(f.prototype,{parse:function(c,d,e,h){if(c===b)return this._rgba=[null,null,null,null],this;if(c.jquery||c.nodeType)c=a(c).css(d),d=b;var i=this,j=a.type(c),o=this._rgba=[];d!==b&&(c=[c,d,e,h],j="array");if(j==="string")return this.parse(n(c)||k._default);if(j==="array")return l(g.rgba.props,function(a,b){o[b.idx]=m(c[b.idx],b)}),this;if(j==="object")return c instanceof f?l(g,function(a,b){c[b.cache]&&(i[b.cache]=c[b.cache].slice())}):l(g,function(b,d){var e=d.cache;l(d.props,function(a,b){if(!i[e]&&d.to){if(a==="alpha"||c[a]==null)return;i[e]=d.to(i._rgba)}i[e][b.idx]=m(c[a],b,!0)}),i[e]&&a.inArray(null,i[e].slice(0,3))<0&&(i[e][3]=1,d.from&&(i._rgba=d.from(i[e])))}),this},is:function(a){var b=f(a),c=!0,d=this;return l(g,function(a,e){var f,g=b[e.cache];return g&&(f=d[e.cache]||e.to&&e.to(d._rgba)||[],l(e.props,function(a,b){if(g[b.idx]!=null)return c=g[b.idx]===f[b.idx],c})),c}),c},_space:function(){var a=[],b=this;return l(g,function(c,d){b[d.cache]&&a.push(c)}),a.pop()},transition:function(a,b){var c=f(a),d=c._space(),e=g[d],i=this.alpha()===0?f("transparent"):this,j=i[e.cache]||e.to(i._rgba),k=j.slice();return c=c[e.cache],l(e.props,function(a,d){var e=d.idx,f=j[e],g=c[e],i=h[d.type]||{};if(g===null)return;f===null?k[e]=g:(i.mod&&(g-f>i.mod/2?f+=i.mod:f-g>i.mod/2&&(f-=i.mod)),k[e]=m((g-f)*b+f,d))}),this[d](k)},blend:function(b){if(this._rgba[3]===1)return this;var c=this._rgba.slice(),d=c.pop(),e=f(b)._rgba;return f(a.map(c,function(a,b){return(1-d)*e[b]+d*a}))},toRgbaString:function(){var b="rgba(",c=a.map(this._rgba,function(a,b){return a==null?b>2?1:0:a});return c[3]===1&&(c.pop(),b="rgb("),b+c.join()+")"},toHslaString:function(){var b="hsla(",c=a.map(this.hsla(),function(a,b){return a==null&&(a=b>2?1:0),b&&b<3&&(a=Math.round(a*100)+"%"),a});return c[3]===1&&(c.pop(),b="hsl("),b+c.join()+")"},toHexString:function(b){var c=this._rgba.slice(),d=c.pop();return b&&c.push(~~(d*255)),"#"+a.map(c,function(a){return a=(a||0).toString(16),a.length===1?"0"+a:a}).join("")},toString:function(){return this._rgba[3]===0?"transparent":this.toRgbaString()}}),f.fn.parse.prototype=f.fn,g.hsla.to=function(a){if(a[0]==null||a[1]==null||a[2]==null)return[null,null,null,a[3]];var b=a[0]/255,c=a[1]/255,d=a[2]/255,e=a[3],f=Math.max(b,c,d),g=Math.min(b,c,d),h=f-g,i=f+g,j=i*.5,k,l;return g===f?k=0:b===f?k=60*(c-d)/h+360:c===f?k=60*(d-b)/h+120:k=60*(b-c)/h+240,h===0?l=0:j<=.5?l=h/i:l=h/(2-i),[Math.round(k)%360,l,j,e==null?1:e]},g.hsla.from=function(a){if(a[0]==null||a[1]==null||a[2]==null)return[null,null,null,a[3]];var b=a[0]/360,c=a[1],d=a[2],e=a[3],f=d<=.5?d*(1+c):d+c-d*c,g=2*d-f;return[Math.round(o(g,f,b+1/3)*255),Math.round(o(g,f,b)*255),Math.round(o(g,f,b-1/3)*255),e]},l(g,function(c,e){var g=e.props,h=e.cache,i=e.to,j=e.from;f.fn[c]=function(c){i&&!this[h]&&(this[h]=i(this._rgba));if(c===b)return this[h].slice();var d,e=a.type(c),k=e==="array"||e==="object"?c:arguments,n=this[h].slice();return l(g,function(a,b){var c=k[e==="object"?a:b.idx];c==null&&(c=n[b.idx]),n[b.idx]=m(c,b)}),j?(d=f(j(n)),d[h]=n,d):f(n)},l(g,function(b,e){if(f.fn[b])return;f.fn[b]=function(f){var g=a.type(f),h=b==="alpha"?this._hsla?"hsla":"rgba":c,i=this[h](),j=i[e.idx],k;return g==="undefined"?j:(g==="function"&&(f=f.call(this,j),g=a.type(f)),f==null&&e.empty?this:(g==="string"&&(k=d.exec(f),k&&(f=j+parseFloat(k[2])*(k[1]==="+"?1:-1))),i[e.idx]=f,this[h](i)))}})}),f.hook=function(b){var c=b.split(" ");l(c,function(b,c){a.cssHooks[c]={set:function(b,d){var e,g,h="";if(d!=="transparent"&&(a.type(d)!=="string"||(e=n(d)))){d=f(e||d);if(!i.rgba&&d._rgba[3]!==1){g=c==="backgroundColor"?b.parentNode:b;while((h===""||h==="transparent")&&g&&g.style)try{h=a.css(g,"backgroundColor"),g=g.parentNode}catch(j){}d=d.blend(h&&h!=="transparent"?h:"_default")}d=d.toRgbaString()}try{b.style[c]=d}catch(j){}}},a.fx.step[c]=function(b){b.colorInit||(b.start=f(b.elem,c),b.end=f(b.end),b.colorInit=!0),a.cssHooks[c].set(b.elem,b.start.transition(b.end,b.pos))}})},f.hook(c),a.cssHooks.borderColor={expand:function(a){var b={};return l(["Top","Right","Bottom","Left"],function(c,d){b["border"+d+"Color"]=a}),b}},k=a.Color.names={aqua:"#00ffff",black:"#000000",blue:"#0000ff",fuchsia:"#ff00ff",gray:"#808080",green:"#008000",lime:"#00ff00",maroon:"#800000",navy:"#000080",olive:"#808000",purple:"#800080",red:"#ff0000",silver:"#c0c0c0",teal:"#008080",white:"#ffffff",yellow:"#ffff00",transparent:[null,null,null,0],_default:"#ffffff"}})(jQuery);
resources/summvis.css
ADDED
@@ -0,0 +1,347 @@
body {
    font-family: 'Roboto', sans-serif;
    font-weight: 400;
    line-height: 1.5;
    color: #262730;
}

.vis-container {
    height: 670px;
    background-color: #F5F7F9;
}

.nodisplay {
    display: none !important;
}

.scroll {
    overflow-y: scroll;
}

.doc-container {
    padding: 10px 20px;
}

.horizontal-layout .doc-container {
    padding-bottom: 0px;
}

.vertical-layout .doc-container {
    float: left;
    width: 50%;
    padding-right: 0px;
}

.summary-container {
    padding: 0px 20px;
}

.vertical-layout .summary-container {
    float: left;
    width: 50%;
    padding-top: 8px;
}

.vertical-layout .main-doc.scroll {
    height: 610px;
}

.main-doc.scroll {
    scrollbar-width: none;
}

/* Works on Chrome, Edge, and Safari */
.main-doc.scroll::-webkit-scrollbar {
    width: 0;
}

.vertical-layout .proxy-doc {
    height: 610px;
}

.vertical-layout .summary-list.scroll {
    height: 610px;
}

.horizontal-layout .scroll {
    height: 270px;
}

.doc {
    display: flex;
}

.main-doc {
    background-color: white;
    padding-left: 17px;
    padding-right: 15px;
    padding-top: 16px;
    border-top-left-radius: 4px;
    border-bottom-left-radius: 4px;
    flex: 1;
    border: 1px solid #e9e9e9;
}

.display .proxy-scroll {
    position: absolute;
    left: 9px;
    width: 9px;
    border-radius: 6px;
    background-color: rgba(0, 0, 0, 0.1);
}

.display .proxy-scroll.hover {
    background-color: rgba(0, 0, 0, 0.2);
}

.proxy-doc {
    flex: 0 0 28px;
    background-color: white;
    position: relative;
    border-bottom-right-radius: 4px;
    border-top-right-radius: 4px;
    padding-left: 3px;
    padding-right: 3px;
    border-top: 1px solid #e9e9e9;
    border-right: 1px solid #e9e9e9;
    border-bottom: 1px solid #e9e9e9;
}

.vertical-layout .proxy-doc {
    margin-right: 25px;
}

.summary-list {
    border-top: 1px solid #ccc;
    border-bottom: 1px solid #ccc;
    border-radius: 4px;
}

.summary-item {
    border-bottom: 1px solid #ccc;
    border-left: 1px solid #ccc;
    border-right: 1px solid #ccc;
    background-color: white;
    padding-top: 16px;
    padding-bottom: 16px;
    padding-left: 23px;
    padding-right: 8px;
}

.summary-item:last-child {
    border-bottom: 0px;
    border-bottom-left-radius: 3px;
}

.summary-item.selected.selectable {
    border-left: 3px solid #2377E9;
    padding-left: 21px;
}

.summary-item.selectable:not(.selected):hover {
    cursor: pointer;
    background-color: #FCFDFF;
}

.summary-item.selected.selectable .highlight:not(.annotation-hidden):hover {
    cursor: pointer;
}

.summary-item.selected.selectable .underline:not(.annotation-hidden):hover {
    cursor: pointer;
}

.summary-item .name {
    margin-bottom: 8px;
    font-weight: 400;
}

.summary-item.selected.selectable .name {
    font-weight: 500;
}

.inactive {
    opacity: 0.5 !important;
}

.stopword.grayed-out {
    opacity: 50%;
}

.has-lexical-alignment .annotate-novel {
    /* Bold all non-underlined items */
    font-weight: 500;
    color: black;
}

.summary-item .stopword {
    font-weight: 400;
}

.summary-item .token-underline {
    font-weight: 400;
}

.summary-item:not(.selected) .underline, .summary-item:not(.selectable) .underline {
    border-color: #909090 !important;
}

.underline.annotation-inactive {
    border-color: #E9E9E9 !important;
}

.underline.annotation-invisible {
    border-color: transparent !important;
}

.underline.annotation-hidden {
    border: 0px !important;
    margin: 0px !important;
}

.proxy-underline.annotation-hidden, .proxy-highlight.annotation-hidden {
    visibility: hidden;
}

.proxy-underline.annotation-inactive {
    background-color: #E9E9E9 !important;
}

.proxy-underline.annotation-invisible {
    background-color: transparent !important;
}

.highlight {
    display: inline-block;
}

.highlight.annotation-hidden {
    background: none !important;
    border-color: transparent !important;
    border-bottom: 0px !important;
}

.highlight.annotation-invisible {
    background-color: transparent !important;
    border-color: transparent !important;
}

.summary-item:not(.selected) .highlight:not(.annotation-hidden),
.summary-item:not(.selectable) .highlight:not(.annotation-hidden) {
    border-color: #909090 !important;
}

.highlight.annotation-inactive {
    border-color: #E9E9E9 !important;
}

.display .proxy-scroll.hidden {
    visibility: hidden;
}

#document-header {
    min-height: 35px;
    margin-bottom: 0px;
    align-items: center;
    color: black;
    display: flex;
}

#summary-header {
    display: flex;
    justify-content: space-between;
    align-items: center;
    min-height: 35px;
    margin-bottom: 0px;
    color: black;
}

.horizontal-layout #summary-header {
    margin-top: 23px;
}

#summary-header-gap {
    flex: 1 0 15px;
}

.highlight.selected {
    border-color: transparent !important;
}

.highlight:not(.selected), .proxy-highlight:not(.selected) {
    background-color: transparent !important;
}

.summary-item.annotate-entities .entity:not(.matches-ngram) {
    color: #fb425c;
    font-weight: 500;
}

.summary-item.annotate-lexical .highlight.matches-ngram {
    padding: 0px;
    border-bottom: 0px !important;
}

.doc .highlight {
    padding: 0px;
    border: 0px !important;
}

ul.annotation-key {
    display: flex;
    align-items: flex-end;
    list-style: none;
    justify-content: flex-start;
    padding: 0px;
    margin: 0px 0px 10px 0px;
}

.annotation-key li {
    margin-right: 15px;
    font-size: 13px;
    padding: 6px 13px 6px 13px;
}

.annotation-key li.option {
    border-radius: 13px;
    cursor: pointer;
    border: 1px solid #F3F3F3;
}

.annotation-key li.option.selected {
    background-color: #F0F2F6;
}

.annotation-key-label {
    margin: 0px;
    padding-left: 0px !important;
    padding-right: 0px !important;
}

.annotation-key-ngram {
    border-bottom: 3px solid #66c2a5;
    padding-bottom: 1px;
}

.annotation-key-semantic {
    border-bottom: 4px dotted #66c2a5;
    padding-bottom: 1px;
}

.annotation-key-novel {
    font-weight: 500;
    color: black;
}

.annotation-key-entity {
    font-weight: 500;
    color: #fb425c;
}

.annotation-key-stopword {
    opacity: 70%;
}
resources/summvis.js
ADDED
@@ -0,0 +1,518 @@
$(document).ready(
    function () {

        // Define global variables

        let isDragging = false;
        let saveDragPos;

        let rtime;
        let timeout = false;
        let delta = 200;

        let disableScrollEvent = false;

        let annotateLexical = false;
        let annotateSemantic = false;
        let annotateNovel = false;
        let annotateEntities = false;

        // Define functions

        function clamp(number, min, max) {
            return Math.max(min, Math.min(number, max));
        }

        function hasScroll() {
            const el = $(".display .main-doc");
            return el.prop("scrollHeight") > el.prop("clientHeight");
        }

        function scrollBy(delta) {
            const proxyDoc = $(".display .proxy-doc");
            const proxyScroll = proxyDoc.find(".proxy-scroll");
            const currentTop = parseFloat(proxyScroll.css("top"));
            const newTop = clamp(currentTop + delta, 0, proxyDoc.innerHeight() - proxyScroll.innerHeight());
            proxyScroll.css("top", newTop);
            const mainDoc = $(".display .main-doc");
            const scaleY = mainDoc[0].scrollHeight / proxyDoc.innerHeight();
            mainDoc.scrollTop(newTop * scaleY)
        }

        function getSpanId(el) {
            return getSpanIds(el)[0]
        }

        function getSpanIds(el) {
            return el.attr("class").split(/\s+/).filter(function (x) {
                return x.startsWith("span-")
            });
        }

        function createProxy() {
            const mainDoc = $(".display .main-doc");
            const proxyDoc = $(".display .proxy-doc");
            const proxyHeight = proxyDoc.innerHeight();
            const proxyWidth = proxyDoc.innerWidth();
            const scaleX = 0.8 * proxyWidth / mainDoc.innerWidth();
            const scaleY = proxyHeight / mainDoc[0].scrollHeight;
            const scrollTop = mainDoc.scrollTop();
            const proxyScrollTop = scrollTop * scaleY;
            const proxyScrollBottom = (scrollTop + mainDoc.innerHeight()) * scaleY;
            const proxyScrollHeight = proxyScrollBottom - proxyScrollTop;
            proxyDoc.empty();

            // Loop through underlines in doc view and create associated proxy element
            if (annotateLexical) {
                $(".display .main-doc .token-underline").each(
                    function (index, value) {
                        const el = $(value);
                        const x = el.position().left;
                        const y = mainDoc.scrollTop() + el.position().top - mainDoc.position().top;
                        const newHeight = 3;
                        const color = el.css("border-bottom-color");
                        const proxyPadding = proxyDoc.innerWidth() - proxyDoc.width();
                        const newX = x * scaleX + proxyPadding / 2;
                        const newY = (y + el.height()) * scaleY - newHeight;
                        const newWidth = Math.min(
                            Math.max((el.width() * scaleX) + 1, 5),
                            proxyDoc.width() + proxyPadding / 2 - newX
                        );

                        let classes = "proxy-underline annotation-hidden " + getSpanIds(el).join(" ");
                        const proxyEl = $('<div/>', {
                            "class": classes,
                            "css": {
                                "position": "absolute",
                                "left": Math.round(newX),
                                "top": Math.round(newY),
                                "background-color": color,
                                "width": newWidth,
                                "height": newHeight,
                            }
                        }).appendTo(proxyDoc);
                        proxyEl.data(el.data());
                    }
                );
            }

            // Loop through all active highlights in doc view and create associated proxy element
            if (annotateSemantic) {
                $(".display .main-doc .highlight").each(
                    function (index, value) {
                        const el = $(value);
                        const x = el.position().left;
                        const y = mainDoc.scrollTop() + el.position().top - mainDoc.position().top;
                        const newHeight = 5;
                        const color = el.css("background-color");
                        const proxyPadding = proxyDoc.innerWidth() - proxyDoc.width()
                        const newX = x * scaleX + proxyPadding / 2;
                        const newY = (y + el.height()) * scaleY - newHeight;
                        const newWidth = Math.min(
                            Math.max((el.width() * scaleX) + 1, 5),
                            proxyDoc.width() + proxyPadding / 2 - newX
                        );
                        const proxyEl = $('<div/>', {
                            "class": 'proxy-highlight annotation-hidden',
                            "css": {
                                "position": "absolute",
                                "left": Math.round(newX),
                                "top": Math.round(newY),
                                "background-color": color,
                                "width": newWidth,
                                "height": newHeight,
                            }
                        }).appendTo(proxyDoc);
                        // Copy data attributes
                        proxyEl.data(el.data());
                        // Set classes for matching
                        proxyEl.addClass(el.data("match-classes"))
                    }
                );
            }
            $('<div/>', {
                "class": 'proxy-scroll hidden',
                "css": {
                    "top": proxyScrollTop,
                    "height": proxyScrollHeight,
                }
            }).appendTo(proxyDoc);
            if (hasScroll()) {
                $(".display .proxy-scroll").removeClass("hidden")
            }

            $(".display .proxy-doc")
                .mousedown(function (event) {
                    saveDragPos = parseFloat(event.pageY);
                    isDragging = true;
                    event.preventDefault();
                })
                .mousemove(function (event) {
                    const dragPos = parseFloat(event.pageY);
                    if (isDragging) {
                        const distanceMoved = dragPos - saveDragPos;
                        scrollBy(distanceMoved);
                        saveDragPos = dragPos;
                        event.preventDefault();
                    }
                })
                .mouseup(function (event) {
                    isDragging = false;
                })
                .mouseenter(function () {
                    disableScrollEvent = true;
                    $(".display .proxy-scroll").addClass("hover")
                })
                .mouseleave(function () {
                    isDragging = false;
                    disableScrollEvent = false;
                    $(".display .proxy-scroll").removeClass("hover")
                })
                .on('wheel', function (event) {
                    scrollBy(event.originalEvent.deltaY / 4);
                    event.preventDefault();
                });

            // TODO: Handle user clicking in scroll region

            $(".display .main-doc").scroll(function () {
                if (disableScrollEvent) return;
                $(".display .proxy-scroll")
                    .css(
                        "top", $(this).scrollTop() * scaleY
                    )
            })
        }

        function resizeend() {
            if (new Date() - rtime < delta) {
                setTimeout(resizeend, delta);
            } else {
                timeout = false;
                updateAnnotations();
                toggleScrollbar();
            }
        }

        function toggleScrollbar() {
            if (hasScroll()) {
                $(".display .proxy-scroll").removeClass("hidden");
            } else {
                $(".display .proxy-scroll").addClass("hidden");
            }
        }

        function updateAnnotations() {

            annotateSemantic = $("#option-semantic").hasClass("selected");
            annotateLexical = $("#option-lexical").hasClass("selected");
            annotateEntities = $("#option-entity").hasClass("selected");
            annotateNovel = $("#option-novel").hasClass("selected");

            if (annotateSemantic || annotateLexical) {
                $(".summary-item").addClass("selectable")
            } else {
                $(".summary-item").removeClass("selectable")
            }

            if (annotateLexical) {
                $(".underline").removeClass("annotation-hidden");
                $(".summary-item").addClass("annotate-lexical");
            } else {
                $(".underline").addClass("annotation-hidden");
                $(".summary-item").removeClass("annotate-lexical");
            }
            if (annotateSemantic) {
                $(".highlight").removeClass("annotation-hidden");
            } else {
                $(".highlight").addClass("annotation-hidden");
            }
            if (annotateEntities) {
                $(".summary-item").addClass("annotate-entities")
            } else {
                $(".summary-item").removeClass("annotate-entities")
            }
            if (annotateNovel) {
                $(".summary-item").addClass("annotate-novel")
            } else {
                $(".summary-item").removeClass("annotate-novel")
            }

            createProxy();

            if (annotateLexical) {
                $(".proxy-underline").removeClass("annotation-hidden");
            } else {
                $(".proxy-underline").addClass("annotation-hidden");
            }
            if (annotateSemantic) {
                $(".proxy-highlight").removeClass("annotation-hidden");
            } else {
                $(".proxy-highlight").addClass("annotation-hidden");
            }

            $(".summary-item .highlight").tooltip("disable");
            if (annotateSemantic) {
                $(".summary-item.selected .highlight").tooltip("enable")
            }
        }

        function removeDocTooltips() {
            $("[data-tooltip-timestamp]").tooltip("dispose").removeAttr("data-tooltip-timestamp");
        }

        function resetUnderlines() {
            $('.annotation-invisible').removeClass("annotation-invisible");
            $('.annotation-inactive').removeClass("annotation-inactive");
            $('.temp-underline-color')
                .each(function () {
                    $(this).css("border-color", $(this).data("primary-color"));
                })
                .removeClass("temp-underline-color")
            $('.temp-proxy-underline-color')
                .each(function () {
                    $(this).css("background-color", $(this).data("primary-color"));
                })
                .removeClass("temp-proxy-underline-color")
        }

        function showDocTooltip(el) {
            const topDocHighlightId = $(el).data("top-doc-highlight-id");
            const topDocSim = $(el).data("top-doc-sim");
            const topHighlight = $(`.display .main-doc .highlight[data-highlight-id=${topDocHighlightId}]`);
            if (!isViewable(topHighlight)) {
                return;
            }
            topHighlight.tooltip({title: `Most similar (${topDocSim})`, trigger: "manual", container: "body"});
            topHighlight.tooltip("show");
            const tooltipTimestamp = Date.now();
            // Do not use .data() method to set data attributes as they are not searchable
            topHighlight.attr("data-tooltip-timestamp", tooltipTimestamp);
            setTimeout(function () {
                if (topHighlight.data("tooltip-timestamp") == tooltipTimestamp) {
                    topHighlight.tooltip("dispose").removeAttr("data-tooltip-timestamp");
                }
            }, 8000);
        }

        function highlightUnderlines() {
            const spanId = getSpanId($(this));
            const color = $(this).css("border-bottom-color");
            // TODO Consolidate into single statement
            $(`.summary-item.selected .underline.${spanId}`).removeClass("annotation-inactive");
            $(`.doc .underline.${spanId}`)
                .removeClass("annotation-inactive")
                .each(function () {
                    $(this).css("border-bottom-color", color);
                })
                .addClass("temp-underline-color");
            $(`.proxy-underline.${spanId}`)
                .removeClass("annotation-inactive")
                .each(function () {
                    $(this).css("background-color", color);
                })
                .addClass("temp-proxy-underline-color");

            $(`.summary-item.selected .underline:not(.${spanId})`).addClass("annotation-inactive");
            $(`.doc .underline:not(.${spanId})`).addClass("annotation-inactive");
            $(`.proxy-underline:not(.${spanId})`).addClass("annotation-inactive");

            $(".summary-item.selected .highlight:not(.annotation-hidden)").addClass("annotation-inactive");
        }

        function resetHighlights() {
            removeDocTooltips();
            $('.summary-item.selected .annotation-inactive').removeClass("annotation-inactive");
            $('.summary-item.selected .annotation-invisible').removeClass("annotation-invisible");
            $('.temp-highlight-color')
                .each(function () {
                    $(this).css("background-color", $(this).data("primary-color"));
                })
                .removeClass("temp-highlight-color");
            $('.highlight.selected').removeClass("selected");
            $('.proxy-highlight.selected').removeClass("selected");
            $('.summary-item [title]').removeAttr("title");
        }

        function highlightToken() {
            const highlightId = $(this).data("highlight-id");
            $(`.summary-item.selected .highlight:not(.summary-highlight-${highlightId})`).addClass("annotation-inactive");
            $('.highlight.selected').removeClass("selected")
            $('.proxy-highlight.selected').removeClass("selected")
            const matchedDocHighlight = `.display .main-doc .summary-highlight-${highlightId}`;
            const matchedProxyHighlight = `.proxy-doc .summary-highlight-${highlightId}`;
            $(matchedDocHighlight + ", " + matchedProxyHighlight)
                .each(function () {
                    const newHighlightColor = $(this).data(`color-${highlightId}`);
                    $(this).css("background-color", newHighlightColor);
                    $(this).addClass("selected");
                })
                .addClass("temp-highlight-color");
            $(".underline").addClass("annotation-inactive");
            $(".proxy-underline").addClass("annotation-invisible")
            showDocTooltip(this);
            $(this).addClass("selected");
            $(this).removeClass("annotation-inactive");
            $('.summary-item [title]').removeAttr("title");
            if (!isViewable($(matchedDocHighlight))) {
                $(this).attr("title", "Click to scroll to most similar word.")
            }
        }

        function isViewable(el) {
            const elTop = el.offset().top;
            const elBottom = elTop + el.outerHeight();
            const scrollRegion = $(".display .main-doc");
            const scrollTop = scrollRegion.offset().top;
            const scrollBottom = scrollTop + scrollRegion.outerHeight();
            return elTop > scrollTop && elBottom < scrollBottom;
        }

        // Initialization

        $(function () {
            $('[data-toggle="tooltip"]').tooltip({
                // 'boundary': '.summary-container'
                trigger: 'hover'
            })
        })
        updateAnnotations();

        // Bind events

        $(window).resize(function () {
            rtime = new Date();
            if (timeout === false) {
                timeout = true;
                setTimeout(resizeend, delta);
            }
        });

        $(".summary-list").on(
            "click",
            ".summary-item.selectable:not(.selected)",
            function () {
                const summary_index = $(this).data("index");

                // Update summary items
                $(".summary-item.selected").removeClass("selected")
                $(this).addClass("selected")

                // Update doc
                // Show the version of document aligned with selected summary index
                $(`.doc[data-index=${summary_index}]`).removeClass("nodisplay").addClass("display");
                // Hide the version of document not aligned with selected summary index
                $(`.doc[data-index!=${summary_index}]`).removeClass("display").addClass("nodisplay");

                updateAnnotations();
            }
        );

        $("#option-lexical").click(function () {
            $(this).toggleClass("selected")
            updateAnnotations()
        });
        $("#option-semantic").click(function () {
            $(this).toggleClass("selected")
            updateAnnotations()
        });
        $("#option-novel").click(function () {
            $(this).toggleClass("selected")
            updateAnnotations()
        });
        $("#option-entity").click(function () {
            $(this).toggleClass("selected")
            updateAnnotations()
        });

        const activeUnderlines = ".summary-item.selected .underline:not(.annotation-inactive):not(.annotation-hidden)";
        $(".summary-list").on(
            "mouseenter",
            activeUnderlines,
            function () {
                highlightUnderlines.call(this);
            }
        );

        $(".summary-list").on(
            "mouseleave",
            activeUnderlines,
            resetUnderlines
        );
        $(".summary-list").on(
            "click",
            activeUnderlines,
            function () {
                // Find aligned underline in doc and scroll doc to that position
                highlightUnderlines.call(this);
                const mainDoc = $(".display .main-doc");
                const spanId = getSpanId($(this));
                const matchedUnderline = $(`.doc .underline.${spanId}`);
                mainDoc.animate({
                        scrollTop: mainDoc.scrollTop() +
                            matchedUnderline.offset().top - mainDoc.offset().top - 60
                    },
                    300
                )
            }
        );

        const activeHighlights = ".summary-item.selected .highlight:not(.annotation-hidden):not(.matches-ngram), " +
            ".summary-item.selected:not(.annotate-lexical) .highlight:not(.annotation-hidden)";
        $(".summary-list").on(
            "mouseenter",
            activeHighlights,
            function () {
                highlightToken.call(this);
            })
        $(".summary-list").on(
            "mouseleave",
            activeHighlights,
            function () {
                resetHighlights();
                resetUnderlines();
            }
        );
        $(".summary-list").on(
            "click",
            activeHighlights,
            function () {
                highlightToken.call(this);
                // Find corresponding highlight in doc representing max similarity and scroll doc to that position
                const topDocHighlightId = $(this).data("top-doc-highlight-id");
                removeDocTooltips(topDocHighlightId);
                const topDocHighlight = $(`.display .main-doc .highlight[data-highlight-id=${topDocHighlightId}]`);
                const mainDoc = $(".display .main-doc");
                const el = this;
                mainDoc.animate({
                        scrollTop: mainDoc.scrollTop() +
                            topDocHighlight.offset().top - mainDoc.offset().top - 60
                    },
                    300,
                    function () {
                        setTimeout(
                            function () {
                                // If no other tooltips have since been displayed
                                if ($("[data-tooltip-timestamp]").length == 0) {
                                    showDocTooltip(el);
                                } else {
                                    console.log("Not showing tooltip because one already exists")
                                }
                            },
                            100
                        )
                    }
                )
            }
        );
        $(".summary-list").on(
            "mouseleave",
            ".summary-item.selected .content",
            function () {
                resetHighlights();
                resetUnderlines();
            },
        );
    }
);
utils.py
ADDED
@@ -0,0 +1,6 @@
import re


def preprocess_text(text):
    split_punct = re.escape(r'()')
    return ' '.join(re.findall(rf"[^\s{split_punct}]+|[{split_punct}]", text))
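A quick check of what `preprocess_text` does: it pulls parentheses out as standalone tokens and collapses all other runs of whitespace into single spaces, presumably so the downstream tokenizer treats documents and summaries uniformly. A doctest-style sketch (the sentence is made up):

    >>> preprocess_text("Alice (a doctor) left\n early.")
    'Alice ( a doctor ) left early.'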
website/annotations.png
ADDED
website/demo.gif
ADDED
website/main-vis.jpg
ADDED
website/title.png
ADDED
website/triangle.png
ADDED