sentence_transformers_support (#2)
Browse files
- Add support for Sentence Transformer (fd892c44ece765cd2eea98f34a84a52cda185d3c)
- Update README.md (2a36dcec5365886f4487325e059223f9b5c65e0b)
- 1_SpladePooling/config.json +5 -0
- README.md +44 -4
- config_sentence_transformers.json +14 -0
- modules.json +14 -0
- sentence_bert_config.json +4 -0
1_SpladePooling/config.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"pooling_strategy": "max",
|
3 |
+
"activation_function": "relu",
|
4 |
+
"word_embedding_dimension": null
|
5 |
+
}
|
README.md
CHANGED
@@ -12,9 +12,12 @@ tags:
|
|
12 |
- passage-retrieval
|
13 |
- knowledge-distillation
|
14 |
- document encoder
|
|
|
|
|
|
|
15 |
pretty_name: Independent Implementation of SPLADE++ Model with some efficiency tweaks for Industry setting.
|
16 |
-
library_name: transformers
|
17 |
-
pipeline_tag:
|
18 |
---
|
19 |
|
20 |
<center>
|
@@ -198,9 +201,46 @@ sparse_rep = expander.expand(
|
|
198 |
["The Manhattan Project and its atomic bomb helped bring an end to World War II. Its legacy of peaceful uses of atomic energy continues to have an impact on history and science."])
|
199 |
```
|
200 |
|
201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
202 |
|
203 |
-
##
|
204 |
|
205 |
**NOTEBOOK user? Login first**
|
206 |
|
|
|
12 |
- passage-retrieval
|
13 |
- knowledge-distillation
|
14 |
- document encoder
|
15 |
+
- sparse-encoder
|
16 |
+
- sparse
|
17 |
+
- splade
|
18 |
pretty_name: Independent Implementation of SPLADE++ Model with some efficiency tweaks for Industry setting.
|
19 |
+
library_name: sentence-transformers
|
20 |
+
pipeline_tag: feature-extraction
|
21 |
---
|
22 |
|
23 |
<center>
|
|
|
201 |
["The Manhattan Project and its atomic bomb helped bring an end to World War II. Its legacy of peaceful uses of atomic energy continues to have an impact on history and science."])
|
202 |
```
|
203 |
|
204 |
+
## 6c. With Sentence Transformers
|
205 |
+
|
206 |
+
First, install the Sentence Transformers library:
|
207 |
+
|
208 |
+
```bash
|
209 |
+
pip install -U sentence-transformers
|
210 |
+
```
|
211 |
+
|
212 |
+
Then you can load this model and run inference.
|
213 |
+
```python
|
214 |
+
from sentence_transformers import SparseEncoder
|
215 |
+
|
216 |
+
# Download from the 🤗 Hub
|
217 |
+
model = SparseEncoder("prithivida/Splade_PP_en_v2")
|
218 |
+
# Run inference
|
219 |
+
sentence = [
|
220 |
+
"The Manhattan Project and its atomic bomb helped bring an end to World War II. Its legacy of peaceful uses of atomic energy continues to have an impact on history and science."
|
221 |
+
]
|
222 |
+
embeddings = model.encode(sentence)
|
223 |
+
print(embeddings.shape)
|
224 |
+
# torch.Size([1, 30522])
|
225 |
+
|
226 |
+
decoded_sentence = model.decode(embeddings[0])
|
227 |
+
print(f"Number of actual dimensions: {len(decoded_sentence)}")
|
228 |
+
decoded_sentence_rounded = [(token, round(score, 2)) for token, score in decoded_sentence]
|
229 |
+
print("SPLADE BOW rep:\n", decoded_sentence_rounded)
|
230 |
+
|
231 |
+
# Number of actual dimensions: 103
|
232 |
+
# SPLADE BOW rep:
|
233 |
+
# [('manhattan', 2.59), ('project', 2.1), ('atomic', 1.65), ('legacy', 1.62), ('bomb', 1.5), ('peaceful', 1.47), ('end', 1.42), ('helped', 1.37), ('wwii', 1.36), ('energy', 1.36), ('war', 1.29), ('1942', 1.29), ('bring', 1.21), ('impact', 1.14),
|
234 |
+
# ('help', 1.09), ('bombs', 1.05), ('ny', 0.93), ('scientist', 0.91), ('nuclear', 0.89), ('history', 0.87), ('projects', 0.87), ('mission', 0.83), ('stop', 0.77), ('wars', 0.76), ('peace', 0.76), ('ii', 0.76), ('affect', 0.76), ('power', 0.73),
|
235 |
+
# ('science', 0.72), ('bombing', 0.72), ('atom', 0.72), ('use', 0.7), ('did', 0.69), ('brought', 0.67), ('still', 0.66), ('purpose', 0.65), ('was', 0.65), ('effect', 0.59), ('scientists', 0.59), ('uses', 0.57), ('because', 0.53), ('historical', 0.48),
|
236 |
+
# ('experiment', 0.47), ('scientific', 0.47), ('safe', 0.46), ('w', 0.45), ('message', 0.44), ('##w', 0.42), ('ended', 0.41), ('hudson', 0.39), ('roosevelt', 0.38), ('were', 0.36), ('##nik', 0.35), ('continue', 0.34), ('hiroshima', 0.33), ('important', 0.33),
|
237 |
+
# ('benefit', 0.32), ('destruction', 0.31), ('used', 0.3), ('nazi', 0.3), ('destroyed', 0.29), ('story', 0.29), ('assisted', 0.27), ('close', 0.27), ('influenced', 0.25), ('world', 0.25), ('invented', 0.24), ('contribution', 0.24), ('military', 0.24), ('conflict', 0.22),
|
238 |
+
# ('1939', 0.22), ('success', 0.22), ('1940s', 0.21), ('nasa', 0.2), ('harry', 0.2), ('revolution', 0.2), ('today', 0.18), ('rescue', 0.17), ('radiation', 0.16), ('destiny', 0.16), ('last', 0.15), ('allies', 0.14), ('the', 0.14), ('created', 0.13), ('hess', 0.13), ('weapon', 0.13),
|
239 |
+
# ('started', 0.11), ('us', 0.1), ('secret', 0.1), ('campaign', 0.09), ('2', 0.08), ('cause', 0.08), ('and', 0.07), ('propaganda', 0.06), ('noah', 0.05), ('theory', 0.05), ('significance', 0.02), ('berlin', 0.01), ('fuel', 0.01), ('columbia', 0.01), ('strategy', 0.01), ('usage', 0.01), ('symbol', 0.0)]
|
240 |
+
|
241 |
+
```
|
242 |
|
243 |
+
## 6d. With HuggingFace
|
244 |
|
245 |
**NOTEBOOK user? Login first**
|
246 |
|
config_sentence_transformers.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model_type": "SparseEncoder",
|
3 |
+
"__version__": {
|
4 |
+
"sentence_transformers": "5.0.0",
|
5 |
+
"transformers": "4.50.3",
|
6 |
+
"pytorch": "2.6.0+cu124"
|
7 |
+
},
|
8 |
+
"prompts": {
|
9 |
+
"query": "",
|
10 |
+
"document": ""
|
11 |
+
},
|
12 |
+
"default_prompt_name": null,
|
13 |
+
"similarity_fn_name": "dot"
|
14 |
+
}
|
modules.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"idx": 0,
|
4 |
+
"name": "0",
|
5 |
+
"path": "",
|
6 |
+
"type": "sentence_transformers.sparse_encoder.models.MLMTransformer"
|
7 |
+
},
|
8 |
+
{
|
9 |
+
"idx": 1,
|
10 |
+
"name": "1",
|
11 |
+
"path": "1_SpladePooling",
|
12 |
+
"type": "sentence_transformers.sparse_encoder.models.SpladePooling"
|
13 |
+
}
|
14 |
+
]
|
sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"max_seq_length": 512,
|
3 |
+
"do_lower_case": false
|
4 |
+
}
|