Upload folder using huggingface_hub
- .gitattributes +35 -0
- DeepSeek-Coder-main/.DS_Store +0 -0
- DeepSeek-Coder-main/.gitignore +3 -0
- DeepSeek-Coder-main/Evaluation/DS-1000/README.md +29 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/README.md +74 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/__pycache__/humaneval.cpython-38.pyc +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-cpp +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-cpp.jsonl +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-cs +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-cs-bu.jsonl +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-cs.jsonl +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-d.jsonl +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-go.jsonl +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-java +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-java.jsonl +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-jl.jsonl +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-js.jsonl +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-lua.jsonl +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-php +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-php.jsonl +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-pl.jsonl +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-python.jsonl +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-r.jsonl +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-rb.jsonl +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-rkt.jsonl +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-rs.jsonl +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-scala.jsonl +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-sh +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-sh.jsonl +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-swift.jsonl +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-ts +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-ts.jsonl +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/eval.sh +4 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/eval_instruct.py +129 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/eval_pal.py +42 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/human_eval/__init__.py +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/human_eval/data.py +49 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/human_eval/evaluate_functional_correctness.py +29 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/human_eval/evaluation.py +296 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/human_eval/execution.py +731 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/humaneval.py +163 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/javatuples-1.2.jar +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/test_config.yaml +15 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/utils/__pycache__/dataset.cpython-38.pyc +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/utils/__pycache__/instruct.cpython-38.pyc +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/utils/__pycache__/utils.cpython-38.pyc +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/utils/__pycache__/utils.cpython-39.pyc +0 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/utils/dataset.py +61 -0
- DeepSeek-Coder-main/Evaluation/HumanEval/utils/utils.py +146 -0
- DeepSeek-Coder-main/Evaluation/LeetCode/data/20240121-Jul-zh.jsonl +0 -0
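The list above shows that local virtual environments (`venv/`, `DeepSeek-Coder-main/finetune/venv/`) and `__pycache__` directories were swept into the commit. As a minimal sketch (not part of this commit; the repo id and patterns are placeholders), such paths can be excluded when uploading with `huggingface_hub`:

```python
from huggingface_hub import HfApi

api = HfApi()
# Hypothetical repo id and folder path; adjust to your own repository.
api.upload_folder(
    folder_path=".",
    repo_id="your-username/your-repo",
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
    # Skip virtual environments, byte-code caches and OS metadata.
    ignore_patterns=["venv/**", "**/__pycache__/**", "**/.DS_Store"],
)
```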
.gitattributes
CHANGED
@@ -33,3 +33,38 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/Evaluation/PAL-Math/datasets/tabmwp/test.json filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/numpy/_core/_multiarray_umath.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pandas/_libs/algos.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pandas/_libs/groupby.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pandas/_libs/hashtable.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pandas/_libs/interval.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pandas/_libs/join.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pandas/_libs/tslibs/offsets.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/_compute.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/_flight.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/lib.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/libarrow.1900.dylib filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/libarrow_acero.1900.dylib filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/libarrow_dataset.1900.dylib filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/libarrow_flight.1900.dylib filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/libarrow_python.1900.0.0.dylib filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/libarrow_python.1900.dylib filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/libarrow_python.dylib filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/libarrow_substrait.1900.dylib filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/libparquet.1900.dylib filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pydantic_core/_pydantic_core.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/sympy/polys/benchmarks/__pycache__/bench_solvers.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/tokenizers/tokenizers.abi3.so filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/torch/bin/protoc filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/torch/bin/protoc-3.13.0.0 filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/torch/lib/libtorch_cpu.dylib filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/torch/lib/libtorch_python.dylib filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/wandb/bin/gpu_stats filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/wandb/bin/wandb-core filter=lfs diff=lfs merge=lfs -text
+DeepSeek-Coder-main/pictures/home.tif filter=lfs diff=lfs merge=lfs -text
+venv/lib/python3.12/site-packages/numpy/_core/_multiarray_umath.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+venv/lib/python3.12/site-packages/pydantic_core/_pydantic_core.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+venv/lib/python3.12/site-packages/tiktoken/_tiktoken.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+venv/lib/python3.12/site-packages/wandb/bin/gpu_stats filter=lfs diff=lfs merge=lfs -text
+venv/lib/python3.12/site-packages/wandb/bin/wandb-core filter=lfs diff=lfs merge=lfs -text
DeepSeek-Coder-main/.DS_Store
ADDED
Binary file (6.15 kB).
DeepSeek-Coder-main/.gitignore
ADDED
@@ -0,0 +1,3 @@
+__pycache__/
+Evaluation/MBPP/eval_instruct.sh
+Evaluation/LeetCode/output/
DeepSeek-Coder-main/Evaluation/DS-1000/README.md
ADDED
@@ -0,0 +1,29 @@
## 1. Introduction

We provide a test script to evaluate the performance of the **deepseek-coder** model on code completion benchmarks. We select the widely used benchmark [**DS-1000**](https://github.com/xlang-ai/DS-1000).

## 2. Evaluation

We directly use the scripts provided by the DS-1000 repository to evaluate the performance of the models. You can refer to [**DS-1000**](https://github.com/xlang-ai/DS-1000) to find more details about the evaluation.


## 3. Experimental Results

We report experimental results here for the completion mode of DS-1000. We set the maximum length to **2048** and employ the **greedy search strategy**. To ensure a fair comparison, we apply identical hyper-parameters across all open-source models under evaluation.

| Model               | Size | Matplotlib | Numpy     | Pandas    | Pytorch   | Scipy     | Scikit-Learn | Tensorflow | Avg       |
|---------------------|------|------------|-----------|-----------|-----------|-----------|--------------|------------|-----------|
| Codex-001           | -    | 41.8%      | 26.6%     | 9.4%      | 9.7%      | 15.0%     | 18.5%        | 17.2%      | 20.2%     |
| Codex-002           | -    | **57.0%**  | 43.1%     | **26.5%** | **41.8%** | 31.8%     | **44.8%**    | 39.3%      | 39.2%     |
| CodeShell           | 7B   | 34.1%      | 21.8%     | 10.7%     | 11.8%     | 17.0%     | 20.0%        | 15.6%      | 18.8%     |
| CodeGeeX2           | 6B   | 38.7%      | 26.8%     | 14.4%     | 11.8%     | 19.8%     | 27.0%        | 17.8%      | 22.9%     |
| StarCoder           | 16B  | 47.7%      | 31.4%     | 12.7%     | 25.0%     | 22.6%     | 35.7%        | 22.2%      | 27.2%     |
| CodeLlama-Base      | 7B   | 41.9%      | 24.6%     | 14.8%     | 16.2%     | 18.9%     | 17.4%        | 17.8%      | 22.1%     |
| CodeLlama-Base      | 13B  | 46.5%      | 28.6%     | 18.2%     | 19.1%     | 18.9%     | 27.8%        | 33.3%      | 26.8%     |
| CodeLlama-Base      | 34B  | 50.3%      | 42.7%     | 23.0%     | 25.0%     | 28.3%     | 33.9%        | 40.0%      | 34.3%     |
|                     |      |            |           |           |           |           |              |            |           |
| DeepSeek-Coder-Base | 1.3B | 32.3%      | 21.4%     | 9.3%      | 8.8%      | 8.5%      | 16.5%        | 8.9%       | 16.2%     |
| DeepSeek-Coder-Base | 5.7B | 51.1%      | 31.8%     | 19.9%     | 14.7%     | 17.0%     | 29.6%        | 15.6%      | 27.7%     |
| DeepSeek-Coder-Base | 6.7B | 48.4%      | 35.5%     | 20.6%     | 19.1%     | 22.6%     | 38.3%        | 24.4%      | 30.5%     |
| DeepSeek-Coder-Base | 33B  | 56.1%      | **49.6%** | 25.8%     | 36.8%     | **36.8%** | 40.0%        | **46.7%**  | **40.2%** |
DeepSeek-Coder-main/Evaluation/HumanEval/README.md
ADDED
@@ -0,0 +1,74 @@
## 1. Introduction

We provide a test script to evaluate the performance of the **deepseek-coder** model on code generation benchmarks. We select the widely used benchmarks **[HumanEval-Python](https://huggingface.co/datasets/openai_humaneval)** and **[HumanEval-Multilingual](https://huggingface.co/datasets/nuprl/MultiPL-E)**.


## 2. Setup

```
pip install accelerate
pip install attrdict
pip install transformers
pip install torch
```


## 3. Evaluation

We've created a sample script, **eval.sh**, that demonstrates how to test the **DeepSeek-Coder-1.3b-Base** model on the HumanEval dataset leveraging **8** GPUs. If your use case involves a different model or dataset, simply adjust the script to fit your needs.

Additionally, for various programming languages, the execution path may differ. Please ensure you update the appropriate paths in the **human_eval/execution.py** file accordingly.

```bash
MODEL_NAME_OR_PATH="deepseek-ai/deepseek-coder-1.3b-base"
DATASET_ROOT="data/"
LANGUAGE="python"
python -m accelerate.commands.launch --config_file test_config.yaml eval_pal.py --logdir ${MODEL_NAME_OR_PATH} --language ${LANGUAGE} --dataroot ${DATASET_ROOT}
```

To evaluate the instruction-based model, please follow the script below:
```bash
LANG="python"
OUPUT_DIR="output"
MODEL="deepseek-coder-33b-instruct"

CUDA_VISIBLE_DEVICES=0,1 python eval_instruct.py \
    --model "deepseek-ai/$MODEL" \
    --output_path "$OUPUT_DIR/${LANG}.$MODEL.jsonl" \
    --language $LANG \
    --temp_dir $OUPUT_DIR
```

## 4. Experimental Results

We report experimental results here for 8 mainstream programming languages: **Python**, **C++**, **Java**, **PHP**, **TypeScript**, **C#**, **Bash**, and **JavaScript**. For all open-source models, we utilize this repository to obtain the performance of the models on the HumanEval dataset. We set the maximum input length to **4096** and the maximum output length to **500**, and employ the **greedy search strategy**.


#### (1) Multilingual Base Models

| Model               | Size | Python    | C++       | Java      | PHP       | TS        | C#        | Bash      | JS        | Avg       |
|---------------------|------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
| code-cushman-001    | 12B  | 33.5%     | 31.9%     | 30.6%     | 28.9%     | 31.3%     | 22.1%     | 11.7%     | -         | -         |
| CodeShell           | 7B   | 35.4%     | 32.9%     | 34.2%     | 31.7%     | 30.2%     | 38.0%     | 7.0%      | 33.5%     | 30.4%     |
| CodeGeeX2           | 6B   | 36.0%     | 29.2%     | 25.9%     | 23.6%     | 20.8%     | 29.7%     | 6.3%      | 24.8%     | 24.5%     |
| StarCoderBase       | 16B  | 31.7%     | 31.1%     | 28.5%     | 25.4%     | 34.0%     | 34.8%     | 8.9%      | 29.8%     | 28.0%     |
| CodeLlama           | 7B   | 31.7%     | 29.8%     | 34.2%     | 23.6%     | 36.5%     | 36.7%     | 12.0%     | 29.2%     | 29.2%     |
| CodeLlama           | 13B  | 36.0%     | 37.9%     | 38.0%     | 34.2%     | 45.2%     | 43.0%     | 16.5%     | 32.3%     | 35.4%     |
| CodeLlama           | 34B  | 48.2%     | 44.7%     | 44.9%     | 41.0%     | 42.1%     | 48.7%     | 15.8%     | 42.2%     | 41.0%     |
|                     |      |           |           |           |           |           |           |           |           |           |
| DeepSeek-Coder-Base | 1.3B | 34.8%     | 31.1%     | 32.3%     | 24.2%     | 28.9%     | 36.7%     | 10.1%     | 28.6%     | 28.3%     |
| DeepSeek-Coder-Base | 5.7B | 48.7%     | 45.3%     | 41.1%     | 39.7%     | 44.7%     | 41.1%     | 27.8%     | 42.2%     | 41.3%     |
| DeepSeek-Coder-Base | 6.7B | 49.4%     | 50.3%     | 43.0%     | 38.5%     | 49.7%     | 50.0%     | 28.5%     | 48.4%     | 44.7%     |
| DeepSeek-Coder-Base | 33B  | **56.1%** | **58.4%** | **51.9%** | **44.1%** | **52.8%** | **51.3%** | **32.3%** | **55.3%** | **50.3%** |

#### (2) Instruction-Tuned Models
| Model                   | Size | Python    | C++       | Java      | PHP       | TS        | C#        | Bash      | JS        | Avg       |
|-------------------------|------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
| GPT-3.5-Turbo           | -    | 76.2%     | 63.4%     | 69.2%     | 60.9%     | 69.1%     | 70.8%     | 42.4%     | 67.1%     | 64.9%     |
| GPT-4                   | -    | **84.1%** | **76.4%** | **81.6%** | **77.2%** | **77.4%** | **79.1%** | **58.2%** | **78.0%** | **76.5%** |
|                         |      |           |           |           |           |           |           |           |           |           |
| DeepSeek-Coder-Instruct | 1.3B | 65.2%     | 45.3%     | 51.9%     | 45.3%     | 59.7%     | 55.1%     | 12.7%     | 52.2%     | 48.4%     |
| DeepSeek-Coder-Instruct | 6.7B | 78.9%     | 63.4%     | 68.4%     | 68.9%     | 67.2%     | 72.8%     | 36.7%     | 72.7%     | 66.1%     |
| DeepSeek-Coder-Instruct | 33B  | **79.3%** | **68.9%** | **73.4%** | **72.7%** | **67.9%** | **74.1%** | **43.0%** | **73.9%** | **69.2%** |
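The execution paths mentioned in Section 3 of this README correspond to the interpreter and compiler prefix variables defined at the top of **human_eval/execution.py** (they default to empty strings). A minimal sketch of how they might be filled in is shown below; the variable names come from the uploaded file, while the concrete paths are placeholders for your own environment:

```python
# Prefix variables from human_eval/execution.py; each prefix is concatenated
# directly with the tool name (e.g. f"{node_exec}node", f"{go_exec}go"), so a
# non-empty value should end with a trailing slash. The paths below are
# illustrative assumptions, not values shipped with the repository.
java_exec = "/usr/lib/jvm/jdk-17/bin/"       # Java toolchain prefix
node_exec = "/usr/local/node-v18.14.0/bin/"  # used as f"{node_exec}node"
tsc_exec  = "/usr/local/node-v18.14.0/bin/"  # used as f"{tsc_exec}tsc"
go_exec   = "/usr/local/go/bin/"             # used as f"{go_exec}go"
php_exec  = "/usr/local/php-8.2/bin/"        # used as f"{php_exec}php"
cs_exec   = "/usr/local/bin/"                # C# runner prefix
```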
DeepSeek-Coder-main/Evaluation/HumanEval/__pycache__/humaneval.cpython-38.pyc
ADDED
Binary file (5.38 kB).

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-cpp
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-cpp.jsonl
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-cs
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-cs-bu.jsonl
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-cs.jsonl
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-d.jsonl
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-go.jsonl
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-java
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-java.jsonl
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-jl.jsonl
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-js.jsonl
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-lua.jsonl
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-php
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-php.jsonl
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-pl.jsonl
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-python.jsonl
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-r.jsonl
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-rb.jsonl
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-rkt.jsonl
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-rs.jsonl
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-scala.jsonl
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-sh
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-sh.jsonl
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-swift.jsonl
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-ts
ADDED
The diff for this file is too large to render.

DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-ts.jsonl
ADDED
The diff for this file is too large to render.
DeepSeek-Coder-main/Evaluation/HumanEval/eval.sh
ADDED
@@ -0,0 +1,4 @@
+MODEL_NAME_OR_PATH="deepseek/deepseek-coder-1b"
+DATASET_ROOT="data/"
+LANGUAGE="python"
+CUDA_VISIBLE_DEVICES=1,2,3 python -m accelerate.commands.launch --config_file test_config.yaml eval_pal.py --logdir ${MODEL_NAME_OR_PATH} --language ${LANGUAGE} --dataroot ${DATASET_ROOT}
DeepSeek-Coder-main/Evaluation/HumanEval/eval_instruct.py
ADDED
@@ -0,0 +1,129 @@
import argparse
import json
import os
import torch
from pathlib import Path
from tqdm import tqdm

data_abs_dir = Path(__file__).parent / "data"

from utils.utils import extract_generation_code, languge_settings
from transformers import AutoTokenizer, AutoModelForCausalLM
from human_eval.evaluation import evaluate_functional_correctness

def build_deepseekcoder_instruction(languge: str, question: str):
    return '''
Please continue to complete the function. You are not allowed to modify the given code and do the completion only. Please return all completed function in a codeblock. Here is the given code to do completion:
```{}
{}
```
'''.strip().format(languge.lower(), question.strip())

def generate_one(example, lang, tokenizer, model):
    prompt = build_deepseekcoder_instruction(languge_settings[lang]['full_name'], example['prompt'])
    inputs = tokenizer.apply_chat_template(
        [{'role': 'user', 'content': prompt}],
        return_tensors="pt",
        add_generation_prompt=True
    ).to(model.device)

    stop_id = tokenizer.convert_tokens_to_ids("<|EOT|>")
    assert isinstance(stop_id, int), "Invalid tokenizer, EOT id not found"

    outputs = model.generate(
        inputs,
        max_new_tokens=1024,
        do_sample=False,
        # top_p=0.95,
        # temperature=temperature,
        pad_token_id=stop_id,
        eos_token_id=stop_id
    )

    output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
    example['output'] = output

    return extract_generation_code(example, lang_code=lang)

def generate_main(args):
    model_name_or_path = args.model
    lang = args.language
    saved_path = args.output_path
    temp_dir = args.temp_dir
    os.makedirs(temp_dir, exist_ok=True)
    problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")

    print("model", model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    print("load tokenizer {} from {} over.".format(tokenizer.__class__, model_name_or_path))
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        # use_flash_attention_2=True
    )
    model.eval()
    examples = [json.loads(x) for x in open(problem_file) if x.strip()]
    print("Read {} examples for evaluation over.".format(len(examples)))

    generated_examples = []
    for ex in tqdm(examples, desc='Generating'):
        gen_example = generate_one(ex, args.language, tokenizer, model)
        generated_examples.append(gen_example)

    print("Generate all over!!!")
    with open(saved_path, 'w', encoding='utf-8') as fw:
        for ex in generated_examples:
            fw.write(json.dumps(ex) + '\n')
        print("Save {} processed examples into {} over!".format(len(generated_examples), saved_path))

    result = evaluate_functional_correctness(
        input_file=saved_path,
        tmp_dir=temp_dir,
        n_workers=8,
        timeout=3.0,
        problem_file=problem_file,
        language=lang
    )
    print(lang, result, model_name_or_path)
    pass

def evaluation_only(args):
    lang = args.language
    temp_dir = args.temp_dir
    assert os.path.exists(args.output_path), "Output file not found: {}".format(args.output_path)
    os.makedirs(temp_dir, exist_ok=True)

    output_name = os.path.basename(args.output_path)
    output_examples = [json.loads(x) for x in open(args.output_path) if x.strip()]

    processed_examples = [extract_generation_code(ex, lang) for ex in tqdm(output_examples, "Processing")]
    processed_path = os.path.join(temp_dir, output_name)
    with open(processed_path, 'w', encoding='utf-8') as fw:
        for ex in processed_examples:
            fw.write(json.dumps(ex) + '\n')
        print("Save {} processed examples into {} over!".format(len(processed_examples), processed_path))

    problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
    from human_eval.evaluation import evaluate_functional_correctness
    result = evaluate_functional_correctness(
        input_file=processed_path,
        tmp_dir=temp_dir,
        n_workers=8,
        timeout=3.0,
        problem_file=problem_file,
        language=lang
    )
    print(lang, result)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, help="model name or path")
    parser.add_argument('--output_path', type=str, help="output path of your generation")
    parser.add_argument('--language', type=str, help="language")
    parser.add_argument('--temp_dir', type=str, help="temp dir for evaluation", default="tmp")
    args = parser.parse_args()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    generate_main(args)
    pass
DeepSeek-Coder-main/Evaluation/HumanEval/eval_pal.py
ADDED
@@ -0,0 +1,42 @@
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import json
import torch.distributed as dist
import subprocess
import sys
from accelerate import Accelerator
from accelerate import DistributedDataParallelKwargs
from pathlib import Path
from argparse import ArgumentParser
from humaneval import HumanEval as evaltor
from transformers import AutoTokenizer, AutoModelForCausalLM

if __name__ == '__main__':
    kwargs_handlers = [DistributedDataParallelKwargs(find_unused_parameters=True)]
    accelerator = Accelerator(mixed_precision="bf16", kwargs_handlers=kwargs_handlers)

    parser = ArgumentParser()
    parser.add_argument("--logdir", type=str, default="")
    parser.add_argument("--language", type=str, default="")
    parser.add_argument("--dataroot", type=str, default="")
    args = parser.parse_args()

    logdir = args.logdir
    language = args.language

    if logdir == "":
        logdir = "tmp/"
    tokenizer = dict(
        cls=AutoTokenizer,
        model_path=logdir,)

    dataroot = args.dataroot

    evaluator = evaltor(data_root=dataroot, max_seq_len=4096, tokenizer_cfg=tokenizer, log_dir=logdir, n_sample=1, batch_size=1, language=language, max_gen_len=500)
    model = AutoModelForCausalLM.from_pretrained(logdir, device_map=accelerator.device, trust_remote_code=True, torch_dtype=torch.bfloat16)
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    evaluator.eval_model(model, accelerator)
DeepSeek-Coder-main/Evaluation/HumanEval/human_eval/__init__.py
ADDED
File without changes
DeepSeek-Coder-main/Evaluation/HumanEval/human_eval/data.py
ADDED
@@ -0,0 +1,49 @@
from typing import Iterable, Dict
import gzip
import json
import os


ROOT = os.path.dirname(os.path.abspath(__file__))
HUMAN_EVAL = os.path.join(ROOT, "..", "data", "HumanEval.jsonl.gz")


def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]:
    return {task["task_id"]: task for task in stream_jsonl(evalset_file)}


def stream_jsonl(filename: str) -> Iterable[Dict]:
    """
    Parses each jsonl line and yields it as a dictionary
    """
    if filename.endswith(".gz"):
        with open(filename, "rb") as gzfp:
            with gzip.open(gzfp, 'rt') as fp:
                for line in fp:
                    if any(not x.isspace() for x in line):
                        yield json.loads(line)
    else:
        with open(filename, "r", encoding="utf-8") as fp:
            for line in fp:
                if any(not x.isspace() for x in line):
                    yield json.loads(line)


def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
    """
    Writes an iterable of dictionaries to jsonl
    """
    if append:
        mode = 'ab'
    else:
        mode = 'wb'
    filename = os.path.expanduser(filename)
    if filename.endswith(".gz"):
        with open(filename, mode) as fp:
            with gzip.GzipFile(fileobj=fp, mode='wb') as gzfp:
                for x in data:
                    gzfp.write((json.dumps(x) + "\n").encode('utf-8'))
    else:
        with open(filename, mode) as fp:
            for x in data:
                fp.write((json.dumps(x) + "\n").encode('utf-8'))
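As a usage sketch (not part of the uploaded files), the helpers in human_eval/data.py can be combined to read one of the bundled problem files and write model samples back out in the JSONL format the evaluator expects; the "generation" field name matches what human_eval/evaluation.py reads, and using the canonical solution as the generation is only a placeholder for a real model call:

```python
from human_eval.data import read_problems, write_jsonl

# Load problems from one of the per-language files shipped under data/
# (task_id -> problem dict with "prompt", "test", "canonical_solution", ...).
problems = read_problems("data/humaneval-python.jsonl")

# Placeholder completions: echo back the canonical solution for each task.
samples = [
    {"task_id": task_id, "prompt": problem["prompt"], "generation": problem["canonical_solution"]}
    for task_id, problem in problems.items()
]

# One JSON object per line; a filename ending in ".gz" would be gzip-compressed.
write_jsonl("samples.jsonl", samples)
```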
DeepSeek-Coder-main/Evaluation/HumanEval/human_eval/evaluate_functional_correctness.py
ADDED
@@ -0,0 +1,29 @@
import fire
import sys

from .data import HUMAN_EVAL
from .evaluation import evaluate_functional_correctness


def entry_point(
    sample_file: str,
    k: str = "1,10,100",
    n_workers: int = 4,
    timeout: float = 3.0,
    problem_file: str = "",
    is_mbpp: bool = False,
):
    """
    Evaluates the functional correctness of generated samples, and writes
    results to f"{sample_file}_results.jsonl.gz"
    """
    k = list(map(int, k.split(",")))
    results = evaluate_functional_correctness(sample_file, k, n_workers, timeout, problem_file, is_mbpp)
    print(results)


def main():
    fire.Fire(entry_point)


sys.exit(main())
DeepSeek-Coder-main/Evaluation/HumanEval/human_eval/evaluation.py
ADDED
@@ -0,0 +1,296 @@
import os
import sys
import fire
import json
import gzip
import regex
import numpy as np
import itertools

from typing import *
from tqdm.auto import tqdm
from collections import defaultdict, Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from .data import stream_jsonl
from .execution import check_correctness

IMPORT_HELPER = {
    "python": [
        "import math",
        "import re",
        "import sys",
        "import copy",
        "import datetime",
        "import itertools",
        "import collections",
        "import heapq",
        "import functools",
        "import hashlib",
        "import numpy",
        "import numpy as np",
        "import string",
        "from typing import *",
        "from collections import *",
    ],
    "go": [
        "math",
        "strings",
        "fmt",
        "strconv",
        "time",
        "bytes",
        "regexp",
        "sort",
        "math/rand",
        "crypto/md5",
    ],
    "cpp": [
        "#include<stdlib.h>",
        "#include<algorithm>",
        "#include<math.h>",
        "#include<stdio.h>",
        "#include<vector>",
        "#include<string>",
        "#include<climits>",
        "#include<cstring>",
        "#include<iostream>",
        "#include<cassert>"
    ],
    "cs": ["using System.Numerics;", "using System.Diagnostics;", "using System.Collections.Generic;", "using System.Linq;", "using System.Text;", "using System.Security.Cryptography;", "using System.Collections.Generic;"]
}


LANGUAGE_NAME = {
    "cpp": "CPP",
    "go": "Go",
    "java": "Java",
    "js": "JavaScript",
    "python": "Python",
}


def read_dataset(
    data_file: str = None,
    dataset_type: str = "humaneval",
    num_shot=None,
) -> Dict:
    """
    Reads a dataset and returns a dictionary of tasks.
    """
    if num_shot is not None:
        print(f"{num_shot}-shot setting...")
    if "humaneval" in dataset_type.lower():
        if data_file is None:
            current_path = os.path.dirname(os.path.abspath(__file__))
            data_file = os.path.join(current_path, "..", "humaneval-x", "python", "data", "humaneval_python.jsonl.gz")
        dataset = {task["task_id"]: task for task in stream_jsonl(data_file)}
    else:
        raise f"Dataset: {dataset_type} not supported."

    return dataset

def estimate_pass_at_k(
    num_samples: Union[int, List[int], np.ndarray],
    num_correct: Union[List[int], np.ndarray],
    k: int
) -> np.ndarray:
    """
    Estimates pass@k of each problem and returns them in an array.
    """

    def estimator(n: int, c: int, k: int) -> float:
        """
        Calculates 1 - comb(n - c, k) / comb(n, k).
        """
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    if isinstance(num_samples, int):
        num_samples_it = itertools.repeat(num_samples, len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        num_samples_it = iter(num_samples)

    return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])

def process_humaneval_test(sample, problems, example_test=False, is_mbpp=False, language="python"):
    """
    Processes a sample for evaluation.
    """
    task_id = sample["task_id"]
    if is_mbpp:
        return sample["generation"] + "\n" + "\n".join(problems[task_id]["test"])

    prompt = sample["prompt"]
    if example_test and "example_test" in problems[task_id] and problems[task_id]["example_test"] != "":
        test = problems[task_id]["example_test"]
    else:
        test = problems[task_id]["test"]
    code = sample["generation"]

    # Pre-process for different languages
    if language == "python":
        test_setup = "\n".join(IMPORT_HELPER["python"]) + "\n"
        test_string = test_setup + code + "\n" + test + "\n"
    elif language == "cpp":
        test_set_up = ""
        for s in IMPORT_HELPER["cpp"]:
            if s not in prompt:
                test_set_up += s + "\n"
        test_string = test_set_up + "\n" + code + "\n" + test
    elif language == "java":
        test_string = code + "\n" + test
    elif language == "cs":
        test_set_up = ""
        for s in IMPORT_HELPER["cs"]:
            test_set_up += s + "\n"
        test_string = test_set_up + "\n" + code + "\n" + test
    elif language in ["js", "javascript", "ts", "sh", "go"]:
        test_string = code + "\n" + test
    elif language == "go232":
        import_string = problems[task_id]["import"]
        prompt = prompt.replace(import_string, "")
        if example_test and "example_test" in problems[task_id]:
            test = problems[task_id]["example_test"]
        else:
            test = problems[task_id]["test"]
        test_setup = problems[task_id]["test_setup"]
        other_pkgs = []
        for pkg in IMPORT_HELPER["go"]:
            if pkg not in test_setup:
                p = pkg.split("/")[-1]
                if p + "." in code:
                    other_pkgs.append(f"\"{pkg}\"")
        if other_pkgs:
            import_other_pkgs = "import (\n" + " ".join([p + "\n" for p in other_pkgs]) + ")"
            test_string = test_setup + "\n" + import_other_pkgs + "\n" + prompt + code + "\n" + test
        else:
            test_string = test_setup + "\n" + prompt + code + "\n" + test
    elif language == "rust":
        main = "\nfn main(){ \n } \n"
        declaration = problems[task_id]["declaration"]
        test_string = main + declaration + prompt + code + test
    elif language == "php":
        if code[:5] != "<?php":
            code = "<?php\n" + code
        test_string = code + "\n" + test + "?>"
    return test_string


def stream_jsonl_all(filename: str) -> Iterable[Dict]:
    """
    Streams a JSONL file.
    """
    results = []
    if filename.endswith(".gz"):
        fp = gzip.open(open(filename, "rb"), "rt")
    else:
        fp = open(filename, "r")
    for line in fp:
        if any(not x.isspace() for x in line):
            results.append(json.loads(line))
    fp.close()

    return results


def evaluate_functional_correctness(
    input_file: str = None,
    tmp_dir: str = "./",
    n_workers: int = 32,
    timeout: float = 10.0,
    problem_file: str = "../data/humaneval_python.jsonl.gz",
    out_dir: str = None,
    k: List[int] = [1, 10, 100],
    test_groundtruth: bool = False,
    example_test: bool = False,
    is_mbpp: bool = False,
    language: str = "python",
):
    """
    Evaluates the functional correctness of a model.
    """
    if example_test:
        print("Example test...")

    problems = read_dataset(problem_file,
                            dataset_type="humaneval")
    sample_jsonl = stream_jsonl_all(input_file)

    with ThreadPoolExecutor(max_workers=n_workers) as executor:

        futures = []
        completion_id = Counter()
        n_samples = 0
        results = defaultdict(list)

        if test_groundtruth:
            print("Testing ground truth...")
            for sample in tqdm(problems.values()):
                task_id = sample["task_id"]
                lang = task_id.split("/")[0].lower()
                if lang == "javascript":
                    lang = "js"
                tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
                sample["generation"] = sample["canonical_solution"]
                sample["test_code"] = process_humaneval_test(sample, problems, example_test, language)
                if sample["test_code"] is None:
                    continue
                args = (task_id, sample, lang, timeout, tmp_dir_, completion_id[task_id])
                future = executor.submit(check_correctness, *args)
                futures.append(future)
                completion_id[task_id] += 1
                n_samples += 1
        else:
            print("Reading samples...")
            for sample in tqdm(sample_jsonl):
                task_id = sample["task_id"]
                if not is_mbpp:
                    lang = language
                if not is_mbpp and lang == "javascript":
                    lang = "js"
                if is_mbpp:
                    lang = "python"
                tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
                sample["task_id"] = task_id
                sample["test_code"] = process_humaneval_test(sample, problems, example_test, is_mbpp, language)
                if sample["test_code"] is None:
                    continue
                if "completion_id" in sample:
                    completion_id_ = sample["completion_id"]
                else:
                    completion_id_ = completion_id[task_id]
                args = (task_id, sample, lang, timeout, tmp_dir_, completion_id_)
                future = executor.submit(check_correctness, *args)
                futures.append(future)
                completion_id[task_id] += 1
                n_samples += 1

        if len(completion_id) == len(problems):
            evaluate_pass_at_k = True
        else:
            evaluate_pass_at_k = False

        print("Running test suites...")
        for future in tqdm(as_completed(futures), total=len(futures)):
            result = future.result()
            results[result["task_id"]].append((result["completion_id"], result))

    # Calculate pass@k.
    total, correct = [], []
    for result in results.values():
        passed = [r[1]["passed"] for r in result]
        total.append(len(passed))
        correct.append(sum(passed))
    total = np.array(total)
    correct = np.array(correct)
    if evaluate_pass_at_k:
        ks = k
        pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean()
                     for k in ks if (total >= k).all()}
        print(pass_at_k)
    else:
        print("Total:", np.sum(total))
        print("Correct:", np.sum(correct))
    return pass_at_k
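To make the pass@k estimator above concrete, here is a small self-contained sketch with made-up sample counts (the numbers are illustrative, not results from this repository):

```python
import numpy as np

def estimator(n: int, c: int, k: int) -> float:
    # 1 - C(n - c, k) / C(n, k), computed as a stable product as in estimate_pass_at_k.
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# Hypothetical: 3 problems, 20 generations each, with 4, 0 and 20 passing samples.
num_samples = [20, 20, 20]
num_correct = [4, 0, 20]

for k in (1, 10):
    per_problem = [estimator(n, c, k) for n, c in zip(num_samples, num_correct)]
    print(f"pass@{k} = {np.mean(per_problem):.4f}")

# pass@1 reduces to the average of c/n per problem: (0.2 + 0.0 + 1.0) / 3 = 0.4
```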
DeepSeek-Coder-main/Evaluation/HumanEval/human_eval/execution.py
ADDED
@@ -0,0 +1,731 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import contextlib
|
2 |
+
import faulthandler
|
3 |
+
import io
|
4 |
+
import multiprocessing
|
5 |
+
import os
|
6 |
+
import platform
|
7 |
+
import signal
|
8 |
+
import random
|
9 |
+
import subprocess
|
10 |
+
import tempfile
|
11 |
+
import gzip
|
12 |
+
import json
|
13 |
+
from typing import *
|
14 |
+
import traceback
|
15 |
+
|
16 |
+
java_exec = ""
|
17 |
+
node_exec = ""
|
18 |
+
tsc_exec = ""
|
19 |
+
go_exec = ""
|
20 |
+
php_exec = ""
|
21 |
+
cs_exec = ""
|
22 |
+
|
23 |
+
def check_correctness(
|
24 |
+
task_id: str,
|
25 |
+
sample: dict,
|
26 |
+
language_type: str,
|
27 |
+
timeout: float = 3.0,
|
28 |
+
tmp_dir: str = None,
|
29 |
+
completion_id: Optional[int] = None,
|
30 |
+
) -> Dict:
|
31 |
+
"""
|
32 |
+
Evaluates the functional correctness of a completion by running the test
|
33 |
+
suite provided in the problem.
|
34 |
+
"""
|
35 |
+
|
36 |
+
def unsafe_execute(tmp_dir):
|
37 |
+
random_id = random.randint(1, 100000)
|
38 |
+
if "python" in language_type.lower():
|
39 |
+
with create_tempdir():
|
40 |
+
|
41 |
+
# These system calls are needed when cleaning up tempdir.
|
42 |
+
import os
|
43 |
+
import shutil
|
44 |
+
rmtree = shutil.rmtree
|
45 |
+
rmdir = os.rmdir
|
46 |
+
chdir = os.chdir
|
47 |
+
|
48 |
+
# Disable functionalities that can make destructive changes to the test.
|
49 |
+
reliability_guard()
|
50 |
+
|
51 |
+
try:
|
52 |
+
exec_globals = {}
|
53 |
+
with swallow_io():
|
54 |
+
with time_limit(timeout):
|
55 |
+
# WARNING
|
56 |
+
# This program exists to execute untrusted model-generated code. Although
|
57 |
+
# it is highly unlikely that model-generated code will do something overtly
|
58 |
+
# malicious in response to this test suite, model-generated code may act
|
59 |
+
# destructively due to a lack of model capability or alignment.
|
60 |
+
# Users are strongly encouraged to sandbox this evaluation suite so that it
|
61 |
+
# does not perform destructive actions on their host or network.
|
62 |
+
# Once you have read this disclaimer and taken appropriate precautions,
|
63 |
+
# uncomment the following line and proceed at your own risk:
|
64 |
+
exec(sample["test_code"], exec_globals)
|
65 |
+
result.append("passed")
|
66 |
+
except TimeoutException:
|
67 |
+
result.append("timed out")
|
68 |
+
except AssertionError as e:
|
69 |
+
result.append(f"failed: AssertionError")
|
70 |
+
except BaseException as e:
|
71 |
+
result.append(f"failed: {e}")
|
72 |
+
#print(sample["test_code"])
|
73 |
+
#print(result)
|
74 |
+
# Needed for cleaning up.
|
75 |
+
shutil.rmtree = rmtree
|
76 |
+
os.rmdir = rmdir
|
77 |
+
os.chdir = chdir
|
78 |
+
|
79 |
+
elif "go" in language_type.lower():
|
80 |
+
assert tmp_dir is not None, "Go should be evaluated in a dir where necessary module files installed."
|
81 |
+
|
82 |
+
import os
|
83 |
+
import shutil
|
84 |
+
|
85 |
+
if "tmp" not in tmp_dir:
|
86 |
+
tmp_dir = os.path.join(tmp_dir, "tmp")
|
87 |
+
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
88 |
+
if not os.path.exists(tmp_dir):
|
89 |
+
os.makedirs(tmp_dir)
|
90 |
+
origin_path = os.getcwd()
|
91 |
+
os.chdir(tmp_dir)
|
92 |
+
open(f"main_test.go", 'w').write(sample["test_code"])
|
93 |
+
try:
|
94 |
+
exec_result = None
|
95 |
+
with time_limit(timeout):
|
96 |
+
# WARNING
|
97 |
+
# This program exists to execute untrusted model-generated code. Although
|
98 |
+
# it is highly unlikely that model-generated code will do something overtly
|
99 |
+
# malicious in response to this test suite, model-generated code may act
|
100 |
+
# destructively due to a lack of model capability or alignment.
|
101 |
+
# Users are strongly encouraged to sandbox this evaluation suite so that it
|
102 |
+
# does not perform destructive actions on their host or network.
|
103 |
+
# Once you have read this disclaimer and taken appropriate precautions,
|
104 |
+
# uncomment the following line and proceed at your own risk:
|
105 |
+
exec_result = subprocess.run([f"{go_exec}go", "test", f"-timeout={timeout}s", "main_test.go"], timeout=timeout, capture_output=True)
|
106 |
+
|
107 |
+
if exec_result.returncode == 0:
|
108 |
+
result.append("passed")
|
109 |
+
else:
|
110 |
+
if exec_result.stderr:
|
111 |
+
try:
|
112 |
+
err = exec_result.stderr.decode()
|
113 |
+
except:
|
114 |
+
err = exec_result.stderr
|
115 |
+
else:
|
116 |
+
try:
|
117 |
+
err = exec_result.stdout.decode()
|
118 |
+
except:
|
119 |
+
err = exec_result.stdout
|
120 |
+
result.append(f"failed: {err}")
|
121 |
+
|
122 |
+
except TimeoutException:
|
123 |
+
result.append("timed out")
|
124 |
+
os.chdir(origin_path)
|
125 |
+
shutil.rmtree(tmp_dir)
|
126 |
+
elif "js" in language_type.lower():
|
127 |
+
import os
|
128 |
+
import shutil
|
129 |
+
|
130 |
+
if "tmp" not in tmp_dir:
|
131 |
+
tmp_dir = os.path.join(tmp_dir, "tmp")
|
132 |
+
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
133 |
+
if not os.path.exists(tmp_dir):
|
134 |
+
os.makedirs(tmp_dir)
|
135 |
+
origin_path = os.getcwd()
|
136 |
+
os.chdir(tmp_dir)
|
137 |
+
open(f"test.js", 'w').write(sample["test_code"])
|
138 |
+
try:
|
139 |
+
exec_result = None
|
140 |
+
with time_limit(timeout):
|
141 |
+
# WARNING
|
142 |
+
# This program exists to execute untrusted model-generated code. Although
|
143 |
+
# it is highly unlikely that model-generated code will do something overtly
|
144 |
+
# malicious in response to this test suite, model-generated code may act
|
145 |
+
# destructively due to a lack of model capability or alignment.
|
146 |
+
# Users are strongly encouraged to sandbox this evaluation suite so that it
|
147 |
+
# does not perform destructive actions on their host or network.
|
148 |
+
# Once you have read this disclaimer and taken appropriate precautions,
|
149 |
+
# uncomment the following line and proceed at your own risk:
|
150 |
+
exec_result = subprocess.run([f"{node_exec}node", "test.js"], timeout=timeout, capture_output=True)
|
151 |
+
|
152 |
+
if exec_result.stderr.decode():
|
153 |
+
err = exec_result.stderr.decode()
|
154 |
+
result.append(f"failed: {err}")
|
155 |
+
elif exec_result.stdout.decode():
|
156 |
+
err = exec_result.stdout.decode()
|
157 |
+
result.append(f"failed: {err}")
|
158 |
+
else:
|
159 |
+
result.append("passed")
|
160 |
+
|
161 |
+
except TimeoutException:
|
162 |
+
result.append("timed out")
|
163 |
+
os.chdir(origin_path)
|
164 |
+
shutil.rmtree(tmp_dir)
|
165 |
+
elif "cpp" in language_type.lower():
|
166 |
+
import os
|
167 |
+
import shutil
|
168 |
+
origin_path = os.getcwd()
|
169 |
+
if "tmp" not in tmp_dir:
|
170 |
+
tmp_dir = os.path.join(tmp_dir, "tmp")
|
171 |
+
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
172 |
+
if not os.path.exists(tmp_dir):
|
173 |
+
os.makedirs(tmp_dir)
|
174 |
+
|
175 |
+
os.chdir(tmp_dir)
|
176 |
+
open(f"test.cpp", 'w').write(sample["test_code"])
|
177 |
+
if "162" in task_id:
|
178 |
+
compilation_result = subprocess.run(["/usr/bin/g++", "-std=c++17", "test.cpp", "-lcrypto", "-lssl"],
|
179 |
+
timeout=timeout,
|
180 |
+
capture_output=True)
|
181 |
+
else:
|
182 |
+
compilation_result = subprocess.run(["/usr/bin/g++", "-std=c++17", "test.cpp"], timeout=timeout,
|
183 |
+
capture_output=True)
|
184 |
+
if compilation_result.returncode != 0:
|
185 |
+
if compilation_result.stderr:
|
186 |
+
err = compilation_result.stderr.decode()
|
187 |
+
else:
|
188 |
+
err = compilation_result.stdout.decode()
|
189 |
+
result.append(f"failed: compilation error: {err}")
|
190 |
+
else:
|
191 |
+
try:
|
192 |
+
exec_result = None
|
193 |
+
with time_limit(timeout):
|
194 |
+
# WARNING
|
195 |
+
# This program exists to execute untrusted model-generated code. Although
|
196 |
+
# it is highly unlikely that model-generated code will do something overtly
|
197 |
+
# malicious in response to this test suite, model-generated code may act
|
198 |
+
# destructively due to a lack of model capability or alignment.
|
199 |
+
# Users are strongly encouraged to sandbox this evaluation suite so that it
|
200 |
+
# does not perform destructive actions on their host or network.
|
201 |
+
# Once you have read this disclaimer and taken appropriate precautions,
|
202 |
+
# uncomment the following line and proceed at your own risk:
|
203 |
+
exec_result = subprocess.run(["./a.out"], timeout=timeout, capture_output=True)
|
204 |
+
|
205 |
+
if exec_result.returncode == 0:
|
206 |
+
result.append("passed")
|
207 |
+
else:
|
208 |
+
if exec_result.stderr:
|
209 |
+
try:
|
210 |
+
err = exec_result.stderr.decode()
|
211 |
+
except:
|
212 |
+
err = exec_result.stderr
|
213 |
+
else:
|
214 |
+
try:
|
215 |
+
err = exec_result.stdout.decode()
|
216 |
+
except:
|
217 |
+
err = exec_result.stdout
|
218 |
+
result.append(f"failed: {err}")
|
219 |
+
except TimeoutException:
|
220 |
+
result.append("timed out")
|
221 |
+
#print(result[-1])
|
222 |
+
#print(sample["test_code"])
|
223 |
+
os.chdir(origin_path)
|
224 |
+
shutil.rmtree(tmp_dir)
|
225 |
+
elif "php" in language_type.lower():
|
226 |
+
import os
|
227 |
+
import shutil
|
228 |
+
origin_path = os.getcwd()
|
229 |
+
if "tmp" not in tmp_dir:
|
230 |
+
tmp_dir = os.path.join(tmp_dir, "tmp")
|
231 |
+
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
232 |
+
if not os.path.exists(tmp_dir):
|
233 |
+
os.makedirs(tmp_dir)
|
234 |
+
|
235 |
+
os.chdir(tmp_dir)
|
236 |
+
open(f"test.php", 'w').write(sample["test_code"])
|
237 |
+
try:
|
238 |
+
exec_result = None
|
239 |
+
with time_limit(timeout):
|
240 |
+
cmd = f"{php_exec}php -f test.php"
|
241 |
+
exec_result = subprocess.run(cmd, timeout=timeout, capture_output=True, shell=True)
|
242 |
+
|
243 |
+
if exec_result.returncode == 0:
|
244 |
+
result.append("passed")
|
245 |
+
else:
|
246 |
+
if exec_result.stderr:
|
247 |
+
try:
|
248 |
+
err = exec_result.stderr.decode()
|
249 |
+
except:
|
250 |
+
err = exec_result.stderr
|
251 |
+
else:
|
252 |
+
try:
|
253 |
+
err = exec_result.stdout.decode()
|
254 |
+
except:
|
255 |
+
err = exec_result.stdout
|
256 |
+
result.append(f"failed: {err}")
|
257 |
+
except TimeoutException:
|
258 |
+
result.append("timed out")
|
259 |
+
print(result[-1])
|
260 |
+
print(sample["test_code"])
|
261 |
+
os.chdir(origin_path)
|
262 |
+
shutil.rmtree(tmp_dir)
|
263 |
+
elif "sh" in language_type.lower():
|
264 |
+
import os
|
265 |
+
import shutil
|
266 |
+
origin_path = os.getcwd()
|
267 |
+
if "tmp" not in tmp_dir:
|
268 |
+
tmp_dir = os.path.join(tmp_dir, "tmp")
|
269 |
+
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
270 |
+
if not os.path.exists(tmp_dir):
|
271 |
+
os.makedirs(tmp_dir)
|
272 |
+
|
273 |
+
os.chdir(tmp_dir)
|
274 |
+
open(f"test.sh", 'w').write(sample["test_code"])
|
275 |
+
try:
|
276 |
+
exec_result = None
|
277 |
+
with time_limit(timeout):
|
278 |
+
cmd = "/bin/bash test.sh"
|
279 |
+
exec_result = subprocess.run(cmd, timeout=10, capture_output=True, shell=True)
|
280 |
+
|
281 |
+
if exec_result.returncode == 0:
|
282 |
+
result.append("passed")
|
283 |
+
else:
|
284 |
+
if exec_result.stderr:
|
285 |
+
try:
|
286 |
+
err = exec_result.stderr.decode()
|
287 |
+
except:
|
288 |
+
err = exec_result.stderr
|
289 |
+
else:
|
290 |
+
try:
|
291 |
+
err = exec_result.stdout.decode()
|
292 |
+
except:
|
293 |
+
err = exec_result.stdout
|
294 |
+
result.append(f"failed: {err}")
|
295 |
+
except TimeoutException:
|
296 |
+
result.append("timed out")
|
297 |
+
#print(result[-1])
|
298 |
+
#print(sample["test_code"])
|
299 |
+
os.chdir(origin_path)
|
300 |
+
shutil.rmtree(tmp_dir)
|
301 |
+
elif "ts" in language_type.lower():
|
302 |
+
import os
|
303 |
+
import shutil
|
304 |
+
origin_path = os.getcwd()
|
305 |
+
if "tmp" not in tmp_dir:
|
306 |
+
tmp_dir = os.path.join(tmp_dir, "tmp")
|
307 |
+
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
308 |
+
if not os.path.exists(tmp_dir):
|
309 |
+
os.makedirs(tmp_dir)
|
310 |
+
|
311 |
+
os.chdir(tmp_dir)
|
312 |
+
env = {"PATH": f"{node_exec}:" + subprocess.os.environ["PATH"]}
|
313 |
+
open(f"test.ts", 'w').write(sample["test_code"])
|
314 |
+
cmd = f"{tsc_exec}tsc test.ts --target ES2015 --lib ES2015,DOM"
|
315 |
+
compilation_result = subprocess.run(cmd, timeout=timeout, capture_output=True, env=env, shell=True)
|
316 |
+
if compilation_result.returncode != 0:
|
317 |
+
if compilation_result.stderr:
|
318 |
+
err = compilation_result.stderr.decode()
|
319 |
+
else:
|
320 |
+
err = compilation_result.stdout.decode()
|
321 |
+
result.append(f"failed: compilation error: {err}")
|
322 |
+
else:
|
323 |
+
try:
|
324 |
+
exec_result = None
|
325 |
+
with time_limit(timeout):
|
326 |
+
exec_result = subprocess.run([f"{node_exec}node", "test.js"], timeout=timeout, capture_output=True)
|
327 |
+
|
328 |
+
if exec_result.returncode == 0:
|
329 |
+
result.append("passed")
|
330 |
+
else:
|
331 |
+
if exec_result.stderr:
|
332 |
+
try:
|
333 |
+
err = exec_result.stderr.decode()
|
334 |
+
except:
|
335 |
+
err = exec_result.stderr
|
336 |
+
else:
|
337 |
+
try:
|
338 |
+
err = exec_result.stdout.decode()
|
339 |
+
except:
|
340 |
+
err = exec_result.stdout
|
341 |
+
result.append(f"failed: {err}")
|
342 |
+
except TimeoutException:
|
343 |
+
result.append("timed out")
|
344 |
+
if result[-1] != "passed":
|
345 |
+
env = {"PATH": f"{node_exec}:" + subprocess.os.environ["PATH"]}
|
346 |
+
cmd = f"{tsc_exec}tsc test.ts"
|
347 |
+
compilation_result = subprocess.run(cmd, timeout=timeout, capture_output=True, env=env, shell=True)
|
348 |
+
if compilation_result.returncode != 0:
|
349 |
+
if compilation_result.stderr:
|
350 |
+
err = compilation_result.stderr.decode()
|
351 |
+
else:
|
352 |
+
err = compilation_result.stdout.decode()
|
353 |
+
result[-1] = f"failed: compilation error: {err}"
|
354 |
+
else:
|
355 |
+
try:
|
356 |
+
exec_result = None
|
357 |
+
with time_limit(timeout):
|
358 |
+
exec_result = subprocess.run([f"{node_exec}node", "test.js"], timeout=timeout, capture_output=True)
|
359 |
+
|
360 |
+
if exec_result.returncode == 0:
|
361 |
+
result[-1] = "passed"
|
362 |
+
else:
|
363 |
+
if exec_result.stderr:
|
364 |
+
try:
|
365 |
+
err = exec_result.stderr.decode()
|
366 |
+
except:
|
367 |
+
err = exec_result.stderr
|
368 |
+
else:
|
369 |
+
try:
|
370 |
+
err = exec_result.stdout.decode()
|
371 |
+
except:
|
372 |
+
err = exec_result.stdout
|
373 |
+
result[-1] = f"failed: {err}"
|
374 |
+
except TimeoutException:
|
375 |
+
result[-1] = "timed out"
|
376 |
+
|
377 |
+
os.chdir(origin_path)
|
378 |
+
shutil.rmtree(tmp_dir)
|
379 |
+
elif "cs" in language_type.lower():
|
380 |
+
import os
|
381 |
+
import shutil
|
382 |
+
origin_path = os.getcwd()
|
383 |
+
if "tmp" not in tmp_dir:
|
384 |
+
tmp_dir = os.path.join(tmp_dir, "tmp")
|
385 |
+
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
386 |
+
if not os.path.exists(tmp_dir):
|
387 |
+
os.makedirs(tmp_dir)
|
388 |
+
os.chdir(tmp_dir)
|
389 |
+
open(f"Program.cs", 'w').write(sample["test_code"])
|
390 |
+
cmd = f"{cs_exec}mcs -d:DEBUG Program.cs"
|
391 |
+
compilation_result = subprocess.run(cmd, shell=True, capture_output=True)
|
392 |
+
if compilation_result.returncode != 0:
|
393 |
+
if compilation_result.stderr:
|
394 |
+
err = compilation_result.stderr.decode()
|
395 |
+
else:
|
396 |
+
err = compilation_result.stdout.decode()
|
397 |
+
result.append(f"failed: compilation error: {err}")
|
398 |
+
else:
|
399 |
+
try:
|
400 |
+
exec_result = None
|
401 |
+
cmd = f"{cs_exec}mono Program.exe"
|
402 |
+
env = dict(MONO_TRACE_LISTENER="Console.Error")
|
403 |
+
with time_limit(timeout):
|
404 |
+
exec_result = subprocess.run(cmd, timeout=timeout, shell=True, capture_output=True, env=env)
|
405 |
+
|
406 |
+
if "Fail" not in exec_result.stderr.decode():
|
407 |
+
result.append("passed")
|
408 |
+
else:
|
409 |
+
if exec_result.stderr:
|
410 |
+
try:
|
411 |
+
err = exec_result.stderr.decode()
|
412 |
+
except:
|
413 |
+
err = exec_result.stderr
|
414 |
+
else:
|
415 |
+
try:
|
416 |
+
err = exec_result.stdout.decode()
|
417 |
+
except:
|
418 |
+
err = exec_result.stdout
|
419 |
+
result.append(f"failed: {err}")
|
420 |
+
except TimeoutException:
|
421 |
+
result.append("timed out")
|
422 |
+
except Exception as e:
|
423 |
+
result.append(f"failed: {e}")
|
424 |
+
os.chdir(origin_path)
|
425 |
+
shutil.rmtree(tmp_dir)
|
426 |
+
elif "rust" in language_type.lower():
|
427 |
+
import os
|
428 |
+
|
429 |
+
WD: str = os.path.dirname(os.path.abspath(__file__))
|
430 |
+
RUST_DIR: str = os.path.join(WD, "rust")
|
431 |
+
RUST_SRC: str = os.path.join(RUST_DIR, "src")
|
432 |
+
RUST_BIN: str = os.path.join(RUST_SRC, "bin")
|
433 |
+
RUST_TMP_DIR: str = os.path.join(RUST_DIR, "tmp")
|
434 |
+
RUST_LOGS: str = os.path.join(RUST_TMP_DIR, "logs")
|
435 |
+
RUST_EXT: str = ".rs"
|
436 |
+
|
437 |
+
# Create mandatory tmp directories
|
438 |
+
os.makedirs(RUST_TMP_DIR, exist_ok=True)
|
439 |
+
os.makedirs(RUST_LOGS, exist_ok=True)
|
440 |
+
os.makedirs(RUST_SRC, exist_ok=True)
|
441 |
+
os.makedirs(RUST_BIN, exist_ok=True)
|
442 |
+
|
443 |
+
with tempfile.NamedTemporaryFile(dir = RUST_BIN, delete=False) as f:
|
444 |
+
#temporal file name
|
445 |
+
file_prefix = sample["task_id"].lower().replace("/", "_")
|
446 |
+
file_name:str = file_prefix +RUST_EXT
|
447 |
+
|
448 |
+
os.rename(f.name, os.path.join(RUST_BIN, file_name))
|
449 |
+
|
450 |
+
# Sample to pure Rust function
|
451 |
+
rust_code: str = sample["test_code"]
|
452 |
+
|
453 |
+
# dump the rust source code in the target temporal file
|
454 |
+
f.write(rust_code.encode('utf-8'))
|
455 |
+
|
456 |
+
# Proceed towards Rust binaries compilation. Therefore move to Rust module root dir.
|
457 |
+
os.chdir(RUST_DIR)
|
458 |
+
|
459 |
+
# Two possible outcomes
|
460 |
+
# Pass OR Fail compilation
|
461 |
+
log_filename: str = file_prefix + ".jsonl"
|
462 |
+
log_path: str = os.path.join(RUST_LOGS, log_filename)
|
463 |
+
cargo_check: str = "cargo check --bin " + file_prefix + " --message-format json >> " + log_path
|
464 |
+
# Compilation build status
|
465 |
+
returned_val_compilation: int
|
466 |
+
|
467 |
+
# Overwrite file content
|
468 |
+
if os.path.exists(log_path):
|
469 |
+
if(file_size := os.path.getsize(log_path)) >= 0:
|
470 |
+
os.remove(log_path)
|
471 |
+
returned_val_compilation = os.system(cargo_check)
|
472 |
+
|
473 |
+
else:
|
474 |
+
returned_val_compilation = os.system(cargo_check)
|
475 |
+
|
476 |
+
# 0 means success
|
477 |
+
if returned_val_compilation == 0:
|
478 |
+
|
479 |
+
#Execution pipeline
|
480 |
+
cargo_test: str = "cargo test --bin " +file_prefix+ " --message-format json >> " + log_path
|
481 |
+
returned_val_execution = os.system(cargo_test)
|
482 |
+
|
483 |
+
if returned_val_execution == 0:
|
484 |
+
result.append("passed")
|
485 |
+
else:
|
486 |
+
result.append(f"failed: execution error")
|
487 |
+
|
488 |
+
else:
|
489 |
+
result.append(f"failed: compilation error")
|
490 |
+
|
491 |
+
|
492 |
+
elif "java" in language_type.lower():
|
493 |
+
assert tmp_dir is not None, "Java should be evaluated in a temporary dir."
|
494 |
+
|
495 |
+
import os
|
496 |
+
import shutil
|
497 |
+
|
498 |
+
if "tmp" not in tmp_dir:
|
499 |
+
tmp_dir = os.path.join(tmp_dir, "tmp")
|
500 |
+
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
501 |
+
if not os.path.exists(tmp_dir):
|
502 |
+
os.makedirs(tmp_dir)
|
503 |
+
open(os.path.join(tmp_dir, "Problem.java"), 'w').write(sample["test_code"])
|
504 |
+
origin_path = os.getcwd()
|
505 |
+
os.system(f"cp ./javatuples-1.2.jar {tmp_dir}/")
|
506 |
+
os.chdir(tmp_dir)
|
507 |
+
res = "failed: unknown error"
|
508 |
+
compile_returncode = -1
|
509 |
+
for _ in range(5):
|
510 |
+
try:
|
511 |
+
cmd = f"{java_exec}javac -cp javatuples-1.2.jar Problem.java"
|
512 |
+
compilation_result = subprocess.run(cmd, timeout=60, capture_output=True, shell=True)
|
513 |
+
compile_returncode = compilation_result.returncode
|
514 |
+
break
|
515 |
+
except subprocess.TimeoutExpired as e:
|
516 |
+
continue
|
517 |
+
if compile_returncode != 0:
|
518 |
+
res = "failed: compilation error"
|
519 |
+
else:
|
520 |
+
exec_result = None
|
521 |
+
try:
|
522 |
+
# WARNING
|
523 |
+
# This program exists to execute untrusted model-generated code. Although
|
524 |
+
# it is highly unlikely that model-generated code will do something overtly
|
525 |
+
# malicious in response to this test suite, model-generated code may act
|
526 |
+
# destructively due to a lack of model capability or alignment.
|
527 |
+
# Users are strongly encouraged to sandbox this evaluation suite so that it
|
528 |
+
# does not perform destructive actions on their host or network.
|
529 |
+
# Once you have read this disclaimer and taken appropriate precautions,
|
530 |
+
# uncomment the following line and proceed at your own risk:
|
531 |
+
cmd = f"{java_exec}java -ea -cp .:javatuples-1.2.jar Problem"
|
532 |
+
exec_result = subprocess.run(cmd, timeout=timeout, capture_output=True, shell=True)
|
533 |
+
if exec_result.returncode == 0:
|
534 |
+
res = "passed"
|
535 |
+
elif exec_result.returncode == 1:
|
536 |
+
if "AssertionError" in exec_result.stderr.decode('unicode-escape'):
|
537 |
+
res = "failed: wrong answer"
|
538 |
+
else:
|
539 |
+
res = f"failed: {exec_result.stderr.decode()}"
|
540 |
+
except subprocess.TimeoutExpired as e:
|
541 |
+
res = "time out"
|
542 |
+
except BaseException as e:
|
543 |
+
res = f"failed: {e}"
|
544 |
+
|
545 |
+
result.append(res)
|
546 |
+
os.chdir(origin_path)
|
547 |
+
shutil.rmtree(tmp_dir)
|
548 |
+
|
549 |
+
manager = multiprocessing.Manager()
|
550 |
+
result = manager.list()
|
551 |
+
|
552 |
+
p = multiprocessing.Process(target=unsafe_execute, args=(tmp_dir,))
|
553 |
+
p.start()
|
554 |
+
p.join(timeout=timeout + 1)
|
555 |
+
if p.is_alive():
|
556 |
+
p.kill()
|
557 |
+
|
558 |
+
if not result:
|
559 |
+
result.append("timed out")
|
560 |
+
|
561 |
+
return {
|
562 |
+
"task_id" : task_id,
|
563 |
+
"completion_id": completion_id,
|
564 |
+
"result" : result[0],
|
565 |
+
"passed" : result[0] == "passed",
|
566 |
+
"finish" : -1 if "finish" not in sample else sample["finish"],
|
567 |
+
"code" : sample["test_code"]
|
568 |
+
}
|
569 |
+
|
570 |
+
# Copyright (c) OpenAI (https://openai.com)

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# ============================================================================
@contextlib.contextmanager
def time_limit(seconds: float):
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    signal.setitimer(signal.ITIMER_REAL, seconds)
    signal.signal(signal.SIGALRM, signal_handler)
    try:
        yield
    finally:
        signal.setitimer(signal.ITIMER_REAL, 0)


@contextlib.contextmanager
def swallow_io():
    stream = WriteOnlyStringIO()
    with contextlib.redirect_stdout(stream):
        with contextlib.redirect_stderr(stream):
            with redirect_stdin(stream):
                yield


@contextlib.contextmanager
def create_tempdir():
    with tempfile.TemporaryDirectory() as dirname:
        with chdir(dirname):
            yield dirname


class TimeoutException(Exception):
    pass


class WriteOnlyStringIO(io.StringIO):
    """ StringIO that throws an exception when it's read from """

    def read(self, *args, **kwargs):
        raise IOError

    def readline(self, *args, **kwargs):
        raise IOError

    def readlines(self, *args, **kwargs):
        raise IOError

    def readable(self, *args, **kwargs):
        """ Returns True if the IO object can be read. """
        return False


class redirect_stdin(contextlib._RedirectStream):  # type: ignore
    _stream = 'stdin'


@contextlib.contextmanager
def chdir(root):
    if root == ".":
        yield
        return
    cwd = os.getcwd()
    os.chdir(root)
    try:
        yield
    except BaseException as exc:
        raise exc
    finally:
        os.chdir(cwd)

def reliability_guard(maximum_memory_bytes: Optional[int] = None):
    """
    This disables various destructive functions and prevents the generated code
    from interfering with the test (e.g. fork bomb, killing other processes,
    removing filesystem files, etc.)

    WARNING
    This function is NOT a security sandbox. Untrusted code, including, model-
    generated code, should not be blindly executed outside of one. See the
    Codex paper for more information about OpenAI's code sandbox, and proceed
    with caution.
    """

    if maximum_memory_bytes is not None:
        import resource
        resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
        resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
        if not platform.uname().system == 'Darwin':
            resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))

    faulthandler.disable()

    import builtins
    builtins.exit = None
    builtins.quit = None

    import os
    os.environ['OMP_NUM_THREADS'] = '1'

    os.kill = None
    os.system = None
    os.putenv = None
    os.remove = None
    os.removedirs = None
    os.rmdir = None
    os.fchdir = None
    os.setuid = None
    os.fork = None
    os.forkpty = None
    os.killpg = None
    os.rename = None
    os.renames = None
    os.truncate = None
    os.replace = None
    os.unlink = None
    os.fchmod = None
    os.fchown = None
    os.chmod = None
    os.chown = None
    os.chroot = None
    os.fchdir = None
    os.lchflags = None
    os.lchmod = None
    os.lchown = None
    os.getcwd = None
    os.chdir = None

    import shutil
    shutil.rmtree = None
    shutil.move = None
    shutil.chown = None

    import subprocess
    subprocess.Popen = None  # type: ignore

    __builtins__['help'] = None

    import sys
    sys.modules['ipdb'] = None
    sys.modules['joblib'] = None
    sys.modules['resource'] = None
    sys.modules['psutil'] = None
    sys.modules['tkinter'] = None
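The harness above only records a per-sample result dict; the aggregation into pass@k is done elsewhere in the evaluation code. For orientation, the standard way to aggregate such results is the unbiased pass@k estimator from the original HumanEval paper; the following is a minimal sketch of that estimator (the function name and usage are illustrative and not taken from this diff):

import numpy as np

def estimate_pass_at_k(num_samples: int, num_correct: int, k: int) -> float:
    # Unbiased estimator: 1 - C(n - c, k) / C(n, k), computed stably as a product.
    if num_samples - num_correct < k:
        return 1.0
    return 1.0 - float(np.prod(1.0 - k / np.arange(num_samples - num_correct + 1, num_samples + 1)))

print(estimate_pass_at_k(num_samples=20, num_correct=5, k=1))  # 0.25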
DeepSeek-Coder-main/Evaluation/HumanEval/humaneval.py
ADDED
@@ -0,0 +1,163 @@
import time
import string
import multiprocessing
import os
import numpy as np
import json
import re
import torch
import datetime
import subprocess
import torch.distributed as dist
from attrdict import AttrDict
from human_eval.evaluation import evaluate_functional_correctness
from transformers import AutoTokenizer
from utils.dataset import HumanEvalDataset
from utils.utils import cleanup_code

class HumanEval:
    """
    HumanEval evaluation class.
    """
    def __init__(self, data_root, max_seq_len=2048,
                 language="python", max_gen_len=200, batch_size=512,
                 log_dir=None, temperature=0, issft=False, top_p=0.95,
                 model_name="", inference_increment=True,
                 tokenizer_cfg=None, n_sample=40, k_sample=1):
        self.data_root = data_root
        self.max_seq_len = max_seq_len
        self.max_gen_len = max_gen_len
        self.batch_size = batch_size
        self.k = k_sample
        self.n_sample = n_sample
        self.language = language
        self.log_dir = log_dir
        self.sft = issft
        self.temperature = temperature
        self.top_p = top_p
        self.model_name = tokenizer_cfg["model_path"].replace("/", "_")
        self.inference_increment = inference_increment
        os.makedirs(self.log_dir, exist_ok=True)
        tokenizer_cls = tokenizer_cfg.pop('cls')
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_cfg.pop("model_path"), trust_remote_code=True)
        except Exception as e:
            print(e)
            assert False

    @torch.no_grad()
    def eval_model(self, gpt, accelerator):
        """
        Evaluate the model on HumanEval.
        """
        assert self.log_dir is not None, "log_dir should not be None when evaluating humaneval"
        dataset = HumanEvalDataset(self.data_root, sample_num=self.n_sample, language=self.language, issft=self.sft)
        nprompt = len(dataset) // self.n_sample
        dp_rank = accelerator.process_index
        dp_size = accelerator.num_processes
        if self.k > 1:
            assert self.n_sample >= 100, "HumanEval PASS@100 needs n_sample >= 100"
        gpt.eval()
        # each process will process a subset of the dataset
        prompt_indices_split = np.array_split(range(nprompt), dp_size)
        prompt_indices = prompt_indices_split[dp_rank]
        indices = [x * self.n_sample + j for x in prompt_indices for j in range(self.n_sample)]
        all_num = len(indices)
        processed_num = 0
        log_file = os.path.join(self.log_dir,
                                f'{self.model_name}_rank{dp_rank}_bs{self.batch_size}_shot_log_{self.language}.json')
        tmpfile = open(log_file, "w")
        start_time = time.time()
        # split the dataset into batches and construct a list of inputs
        for idx in range(0, len(indices), self.batch_size):
            prompt_list = []
            prompt_lens = []
            orriginal_prompt_list = []
            tokenized_prompt_lens = []
            taskid = []
            # get the prompts from the dataset
            for j in indices[idx:idx + self.batch_size]:
                data = dataset[j]
                fprompt = data["prompt"].strip()
                prompt_list.append(fprompt)
                tmp = self.tokenizer.encode(fprompt)
                orriginal_prompt_list.append(data["original_prompt"])
                prompt_lens.append(len(fprompt))
                tokenized_prompt_lens.append(tmp)
                taskid.append(data["task_id"])
            input_ids = torch.tensor(tokenized_prompt_lens).to(accelerator.device)
            # generate the code
            if self.temperature != 0:
                decoded = gpt.generate(
                    input_ids=input_ids,
                    max_new_tokens=self.max_gen_len,
                    do_sample=True,
                    eos_token_id=self.tokenizer.eos_token_id,
                    temperature=self.temperature,
                    top_p=self.top_p,
                    pad_token_id=self.tokenizer.eos_token_id,
                )
            else:
                decoded = gpt.generate(
                    input_ids=input_ids,
                    max_new_tokens=self.max_gen_len,
                    do_sample=False,
                    eos_token_id=self.tokenizer.eos_token_id,
                    pad_token_id=self.tokenizer.eos_token_id,
                )
            # save the results to a file
            for local_idx, text in enumerate(decoded):
                prediction = decoded[local_idx]
                prediction = self.tokenizer.decode(prediction, skip_special_tokens=True)
                suffixprediction = prediction[prompt_lens[local_idx]:]
                suffixprediction = cleanup_code(suffixprediction, self.language, "humaneval", self.sft, dataset.stopwords)
                # sft mode does not need original prompt
                if not self.sft:
                    suffixprediction = orriginal_prompt_list[local_idx] + "\n" + suffixprediction
                res = {"task_id": taskid[local_idx], "generation": suffixprediction, "prompt": orriginal_prompt_list[local_idx], "wholecode": prediction}
                tmpfile.write(json.dumps(res) + "\n")
                tmpfile.flush()
                processed_num += 1
            self.log_score(dp_rank, processed_num, all_num, start_time, self.batch_size)
        tmpfile.close()
        accelerator.wait_for_everyone()
        # calculate the final score of pass@k
        self._calculate_final_score(accelerator)
        accelerator.wait_for_everyone()
        return

    def log_score(self, dp_rank, processed_num, all_num, start_time, bs):
        """
        Log the score.
        """
        mem = torch.cuda.max_memory_allocated() / (1 << 30)
        avg_time = (time.time() - start_time) / processed_num * bs
        print(
            f'DP RANK:{dp_rank} process_num/all_num:{int(processed_num)}/{all_num} '
            f'avg_time_per_batch:{avg_time:.2f} s '
            f'still_need:{((all_num - processed_num) // bs + 1) * avg_time / 60:.2f} m',
            f'mem:{mem:.3f} GiB bs:{bs}',
            flush=True
        )
        if processed_num == all_num:
            print(f'EVAL DONE! Process time {(time.time() - start_time) / 60:.2f} m', flush=True)

    def _calculate_final_score(self, accelerator):
        """
        Calculate the final score.
        """
        if accelerator.is_local_main_process:
            logfilepath = os.path.join(self.log_dir, f'final_{self.model_name}.jsonl')
            logfile = open(logfilepath, "w")
            for i in range(accelerator.num_processes):
                tmplogfile = os.path.join(self.log_dir, f'{self.model_name}_rank{i}_bs{self.batch_size}_shot_log_{self.language}.json')
                logfile.write(open(tmplogfile).read().strip() + "\n")
                os.remove(tmplogfile)
            logfile.close()
            timeout = 10
            runlang = self.language
            res = evaluate_functional_correctness(input_file=logfilepath, problem_file=os.path.join(self.data_root, f"humaneval-{self.language}.jsonl"), tmp_dir=self.log_dir, timeout=timeout, language=runlang)
            print("score is", res['pass@%d' % self.k])
            os.remove(logfilepath)
        return
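For context, a driver for the HumanEval class above would look roughly like the sketch below. The model name, data root and log directory are placeholders, and the repository's own entry point (e.g. eval_pal.py) may wire this up differently:

from accelerate import Accelerator
from transformers import AutoModelForCausalLM

accelerator = Accelerator()
model = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-coder-1.3b-base", trust_remote_code=True)
model = model.to(accelerator.device)  # eval_model moves the input_ids to accelerator.device itself

evaluator = HumanEval(
    data_root="data",                 # placeholder: folder containing humaneval-python.jsonl
    log_dir="output/humaneval_logs",  # placeholder
    language="python",
    n_sample=1,
    k_sample=1,
    tokenizer_cfg={"cls": "AutoTokenizer", "model_path": "deepseek-ai/deepseek-coder-1.3b-base"},
)
evaluator.eval_model(model, accelerator)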
DeepSeek-Coder-main/Evaluation/HumanEval/javatuples-1.2.jar
ADDED
Binary file (65.5 kB).
DeepSeek-Coder-main/Evaluation/HumanEval/test_config.yaml
ADDED
@@ -0,0 +1,15 @@
compute_environment: LOCAL_MACHINE
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 3
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
DeepSeek-Coder-main/Evaluation/HumanEval/utils/__pycache__/dataset.cpython-38.pyc
ADDED
Binary file (2.48 kB).
DeepSeek-Coder-main/Evaluation/HumanEval/utils/__pycache__/instruct.cpython-38.pyc
ADDED
Binary file (2.87 kB).
DeepSeek-Coder-main/Evaluation/HumanEval/utils/__pycache__/utils.cpython-38.pyc
ADDED
Binary file (1.24 kB).
DeepSeek-Coder-main/Evaluation/HumanEval/utils/__pycache__/utils.cpython-39.pyc
ADDED
Binary file (3.49 kB).
DeepSeek-Coder-main/Evaluation/HumanEval/utils/dataset.py
ADDED
@@ -0,0 +1,61 @@
import os
import numpy as np
import json

class HumanEvalDataset:

    def __init__(self, root, sample_num=1, language="python", issft=False):
        """
        root: the path to the HumanEval dataset
        sample_num: the number of samples for each prompt
        language: the language of the HumanEval dataset
        issft: whether to use the SFT setting
        """
        self.root = root
        self.data = open(os.path.join(self.root, f"humaneval-{language}.jsonl")).readlines()

        tmp = self.get_qa_only_data(self.data, issft)
        self.clean_data = []
        for i in range(len(tmp)):
            for j in range(sample_num):
                self.clean_data.append(tmp[i])
        self.stopwords = self.clean_data[0]["stopwords"]
        np.random.seed(1234)
        print(f"Read HumanEval from {root}, number of samples {len(self.clean_data)}")

    def get_qa_only_data(self, data_json, sft=False):
        """
        data_json: the jsonl file of HumanEval
        sft: whether to use the SFT setting
        return: a list of dict, each dict contains the prompt, task_id and stopwords
        """
        ans = []
        for line in data_json:
            line = json.loads(line)
            prompt = line["prompt"].strip()
            if "prefix" in line:
                origin_prompt = line["prefix"]
            else:
                origin_prompt = line["prompt"]

            if sft:
                prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context.\nWrite a response that appropriately completes the request.\n\n### Instruction:\nWrite a program to perform the given task.\n\nInput:\n{prompt}\n\n### Response:\n"""
            if "stop_tokens" in line:
                s = line["stop_tokens"]
            else:
                s = []
            ans.append({"prompt": prompt, "task_id": line["task_id"], "original_prompt": origin_prompt, "stopwords": s})
        return ans

    def __len__(self):
        """
        return the number of samples in the dataset
        """
        return len(self.clean_data)

    def __getitem__(self, index):
        """
        return the sample at index
        """
        sample = self.clean_data[index]
        return sample
DeepSeek-Coder-main/Evaluation/HumanEval/utils/utils.py
ADDED
@@ -0,0 +1,146 @@
import re

languge_settings = {
    'python': {
        'full_name': 'Python',
        'indent': 4,
    },
    'cpp': {
        'full_name': 'cpp',
        'indent': 0,
        'main': "int main()",
    },
    'java': {
        'full_name': 'Java',
        'indent': 4,
        'main': "public static void main",
    },
    'cs': {
        'full_name': "csharp",
        'indent': 0,
        'main': "public static void Main",
    },
    'php': {
        'full_name': "PHP",
        'indent': 0,
    },
    'ts': {
        'full_name': "TypeScript",
        'indent': 0,
    },
    'js': {
        'full_name': "JavaScript",
        'indent': 0
    },
    'sh': {
        'full_name': "Bash",
        'indent': 0
    }
}

def get_function_name(question: str, lang: str):
    func_lines = [x for x in question.strip().split('\n') if x.strip()]

    if lang.lower() == 'python':
        func_idx = [i for i in range(len(func_lines)) if func_lines[i].startswith("def ")][-1]
        func_name = func_lines[func_idx].split('(')[0].strip()
        func_prefix = "\n".join(func_lines[:func_idx])
        return func_name, func_prefix

    func_name = func_lines[-1].split('{')[0].strip()
    func_prefix = "\n".join(func_lines[:-1])
    return func_name, func_prefix

def extract_generation_code(example: str, lang_code: str, verbose: bool=False):
    task_id = example['task_id']
    output = example.get('output', example.get("gpt_completion"))
    question = example["prompt"].strip()
    setting = languge_settings[lang_code]
    lang = setting['full_name']
    indent = setting['indent']

    try:
        code_block: str = re.findall(f'```{lang.lower()}\n(.*?)```', output, re.DOTALL | re.IGNORECASE)[0]
        if verbose:
            print(">>> Task: {}\n{}".format(task_id, code_block))

        # Remove main
        if setting.get('main', None) and setting['main'] in code_block:
            main_start = code_block.index(setting['main'])
            code_block = code_block[:main_start]

        func_name, func_prefix = get_function_name(question, lang)

        try:
            start = code_block.lower().index(func_name.lower())
            indent = 0
            while start - indent >= 0 and code_block[start - indent - 1] == ' ':
                indent += 1

            try:
                end = code_block.rindex('\n' + ' '*indent + '}')
            except:
                end = len(code_block)
        except:
            start = 0
            try:
                end = code_block.rindex('\n' + ' '*indent + '}')
            except:
                end = len(code_block)

        body = code_block[start:end]

        if lang_code.lower() in ['php', 'ts', 'js']:
            body += '\n' + ' '*indent + '}'

        generation = func_prefix + '\n' + body + '\n'
        example['generation'] = generation

    except Exception as ex:
        print("Failed to extract code block with error `{}`:\n>>> Task: {}\n>>> Output:\n{}".format(
            ex, task_id, output
        ))
        example['generation'] = example['prompt'] + '\n' + output

    return example

def cleanup_code(
    code: str,
    language_type: str = None,
    dataset: str = None,
    issft: bool = False,
    stop_words = []
):
    """
    Cleans up the generated code.
    """

    if language_type.lower() == "python":
        if issft:
            code = _clean_python_code_for_sft(code)
        stop_words = ["\ndef", "\nclass", "\nif", "\n#", "\nprint"]
        code = _truncate_code_at_stopwords(code, stop_words)
    elif language_type.lower() == "ts":
        code = _truncate_code_at_stopwords(code, stop_words + ["\nexport", "\nimport", "\nexport default", "\nimport default", "\nconsole.log"])
    else:
        code = _truncate_code_at_stopwords(code, stop_words)

    return code

def _clean_python_code_for_sft(code):
    code = code.replace("\r", "")
    if "```python" in code:
        code_start_idx = code.index("```python")
        code = code[code_start_idx:].replace("```python", "").strip()
        end_idx = code.find("```") if "```" in code else len(code)
        code = code[:end_idx].strip()

    return code

def _truncate_code_at_stopwords(code, stop_words):
    min_stop_idx = len(code)
    for stop_word in stop_words:
        stop_index = code.find(stop_word)
        if 0 <= stop_index < min_stop_idx:
            min_stop_idx = stop_index
    return code[:min_stop_idx]
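To make the truncation behaviour of cleanup_code concrete, here is a small self-contained example (expected output noted in the comment; the sample completion string is invented for illustration):

completion = "    return a + b\n\ndef helper():\n    pass\n"
cleaned = cleanup_code(completion, language_type="python", dataset="humaneval", issft=False, stop_words=[])
print(repr(cleaned))  # "    return a + b\n" -- everything from the first "\ndef" onward is dropped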
DeepSeek-Coder-main/Evaluation/LeetCode/data/20240121-Jul-zh.jsonl
ADDED
The diff for this file is too large to render.