ordlibrary committed (verified)
Commit f1e6b80 · 1 Parent(s): 6a0e7ea

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50):
  1. .gitattributes +35 -0
  2. DeepSeek-Coder-main/.DS_Store +0 -0
  3. DeepSeek-Coder-main/.gitignore +3 -0
  4. DeepSeek-Coder-main/Evaluation/DS-1000/README.md +29 -0
  5. DeepSeek-Coder-main/Evaluation/HumanEval/README.md +74 -0
  6. DeepSeek-Coder-main/Evaluation/HumanEval/__pycache__/humaneval.cpython-38.pyc +0 -0
  7. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-cpp +0 -0
  8. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-cpp.jsonl +0 -0
  9. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-cs +0 -0
  10. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-cs-bu.jsonl +0 -0
  11. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-cs.jsonl +0 -0
  12. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-d.jsonl +0 -0
  13. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-go.jsonl +0 -0
  14. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-java +0 -0
  15. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-java.jsonl +0 -0
  16. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-jl.jsonl +0 -0
  17. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-js.jsonl +0 -0
  18. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-lua.jsonl +0 -0
  19. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-php +0 -0
  20. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-php.jsonl +0 -0
  21. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-pl.jsonl +0 -0
  22. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-python.jsonl +0 -0
  23. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-r.jsonl +0 -0
  24. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-rb.jsonl +0 -0
  25. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-rkt.jsonl +0 -0
  26. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-rs.jsonl +0 -0
  27. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-scala.jsonl +0 -0
  28. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-sh +0 -0
  29. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-sh.jsonl +0 -0
  30. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-swift.jsonl +0 -0
  31. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-ts +0 -0
  32. DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-ts.jsonl +0 -0
  33. DeepSeek-Coder-main/Evaluation/HumanEval/eval.sh +4 -0
  34. DeepSeek-Coder-main/Evaluation/HumanEval/eval_instruct.py +129 -0
  35. DeepSeek-Coder-main/Evaluation/HumanEval/eval_pal.py +42 -0
  36. DeepSeek-Coder-main/Evaluation/HumanEval/human_eval/__init__.py +0 -0
  37. DeepSeek-Coder-main/Evaluation/HumanEval/human_eval/data.py +49 -0
  38. DeepSeek-Coder-main/Evaluation/HumanEval/human_eval/evaluate_functional_correctness.py +29 -0
  39. DeepSeek-Coder-main/Evaluation/HumanEval/human_eval/evaluation.py +296 -0
  40. DeepSeek-Coder-main/Evaluation/HumanEval/human_eval/execution.py +731 -0
  41. DeepSeek-Coder-main/Evaluation/HumanEval/humaneval.py +163 -0
  42. DeepSeek-Coder-main/Evaluation/HumanEval/javatuples-1.2.jar +0 -0
  43. DeepSeek-Coder-main/Evaluation/HumanEval/test_config.yaml +15 -0
  44. DeepSeek-Coder-main/Evaluation/HumanEval/utils/__pycache__/dataset.cpython-38.pyc +0 -0
  45. DeepSeek-Coder-main/Evaluation/HumanEval/utils/__pycache__/instruct.cpython-38.pyc +0 -0
  46. DeepSeek-Coder-main/Evaluation/HumanEval/utils/__pycache__/utils.cpython-38.pyc +0 -0
  47. DeepSeek-Coder-main/Evaluation/HumanEval/utils/__pycache__/utils.cpython-39.pyc +0 -0
  48. DeepSeek-Coder-main/Evaluation/HumanEval/utils/dataset.py +61 -0
  49. DeepSeek-Coder-main/Evaluation/HumanEval/utils/utils.py +146 -0
  50. DeepSeek-Coder-main/Evaluation/LeetCode/data/20240121-Jul-zh.jsonl +0 -0
.gitattributes CHANGED
@@ -33,3 +33,38 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/Evaluation/PAL-Math/datasets/tabmwp/test.json filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/numpy/_core/_multiarray_umath.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pandas/_libs/algos.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pandas/_libs/groupby.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pandas/_libs/hashtable.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pandas/_libs/interval.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pandas/_libs/join.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pandas/_libs/tslibs/offsets.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/_compute.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/_flight.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/lib.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/libarrow.1900.dylib filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/libarrow_acero.1900.dylib filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/libarrow_dataset.1900.dylib filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/libarrow_flight.1900.dylib filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/libarrow_python.1900.0.0.dylib filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/libarrow_python.1900.dylib filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/libarrow_python.dylib filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/libarrow_substrait.1900.dylib filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pyarrow/libparquet.1900.dylib filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/pydantic_core/_pydantic_core.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/sympy/polys/benchmarks/__pycache__/bench_solvers.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/tokenizers/tokenizers.abi3.so filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/torch/bin/protoc filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/torch/bin/protoc-3.13.0.0 filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/torch/lib/libtorch_cpu.dylib filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/torch/lib/libtorch_python.dylib filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/wandb/bin/gpu_stats filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/finetune/venv/lib/python3.12/site-packages/wandb/bin/wandb-core filter=lfs diff=lfs merge=lfs -text
+ DeepSeek-Coder-main/pictures/home.tif filter=lfs diff=lfs merge=lfs -text
+ venv/lib/python3.12/site-packages/numpy/_core/_multiarray_umath.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+ venv/lib/python3.12/site-packages/pydantic_core/_pydantic_core.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+ venv/lib/python3.12/site-packages/tiktoken/_tiktoken.cpython-312-darwin.so filter=lfs diff=lfs merge=lfs -text
+ venv/lib/python3.12/site-packages/wandb/bin/gpu_stats filter=lfs diff=lfs merge=lfs -text
+ venv/lib/python3.12/site-packages/wandb/bin/wandb-core filter=lfs diff=lfs merge=lfs -text
DeepSeek-Coder-main/.DS_Store ADDED
Binary file (6.15 kB).
 
DeepSeek-Coder-main/.gitignore ADDED
@@ -0,0 +1,3 @@
+ __pycache__/
+ Evaluation/MBPP/eval_instruct.sh
+ Evaluation/LeetCode/output/
DeepSeek-Coder-main/Evaluation/DS-1000/README.md ADDED
@@ -0,0 +1,29 @@
+ ## 1. Introduction
+
+ We provide a test script to evaluate the performance of the **deepseek-coder** model on code completion benchmarks. We select the widely used [**DS-1000**](https://github.com/xlang-ai/DS-1000) benchmark.
+
+ ## 2. Evaluation
+
+ We directly use the scripts provided by the DS-1000 repository to evaluate the models. Please refer to [**DS-1000**](https://github.com/xlang-ai/DS-1000) for more details about the evaluation.
+
+ ## 3. Experimental Results
+
+ We report results for the completion mode of DS-1000. We set the maximum length to **2048** and employ the **greedy search strategy** (a reference decoding sketch follows the table). To ensure a fair comparison, we apply identical hyper-parameters across all open-source models under evaluation.
+
+ | Model | Size | Matplotlib | Numpy | Pandas | Pytorch | Scipy | Scikit-Learn | Tensorflow | Avg |
+ |------------------------|------|------------|-------|--------|---------|-------|--------------|------------|-------|
+ | Codex-001 | - | 41.8% | 26.6% | 9.4% | 9.7% | 15.0% | 18.5% | 17.2% | 20.2% |
+ | Codex-002 | - | **57.0%** | 43.1% | **26.5%** | **41.8%** | 31.8% | **44.8%** | 39.3% | 39.2% |
+ | CodeShell | 7B | 34.1% | 21.8% | 10.7% | 11.8% | 17.0% | 20.0% | 15.6% | 18.8% |
+ | CodeGeeX2 | 6B | 38.7% | 26.8% | 14.4% | 11.8% | 19.8% | 27.0% | 17.8% | 22.9% |
+ | StarCoder | 16B | 47.7% | 31.4% | 12.7% | 25.0% | 22.6% | 35.7% | 22.2% | 27.2% |
+ | CodeLLama-Base | 7B | 41.9% | 24.6% | 14.8% | 16.2% | 18.9% | 17.4% | 17.8% | 22.1% |
+ | CodeLLama-Base | 13B | 46.5% | 28.6% | 18.2% | 19.1% | 18.9% | 27.8% | 33.3% | 26.8% |
+ | CodeLLama-Base | 34B | 50.3% | 42.7% | 23.0% | 25.0% | 28.3% | 33.9% | 40.0% | 34.3% |
+ | | | | | | | | | | |
+ | DeepSeek-Coder-Base | 1.3B | 32.3% | 21.4% | 9.3% | 8.8% | 8.5% | 16.5% | 8.9% | 16.2% |
+ | DeepSeek-Coder-Base | 5.7B | 51.1% | 31.8% | 19.9% | 14.7% | 17.0% | 29.6% | 15.6% | 27.7% |
+ | DeepSeek-Coder-Base | 6.7B | 48.4% | 35.5% | 20.6% | 19.1% | 22.6% | 38.3% | 24.4% | 30.5% |
+ | DeepSeek-Coder-Base | 33B | 56.1% | **49.6%** | 25.8% | 36.8% | **36.8%** | 40.0% | **46.7%** | **40.2%** |
+
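For reference, the greedy decoding setup described in Section 3 roughly corresponds to the following `transformers` generation call. This is a hedged sketch, not the repository's own DS-1000 runner; the model name and prompt are examples only.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Example checkpoint; any of the evaluated open-source models could be substituted.
name = "deepseek-ai/deepseek-coder-1.3b-base"
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name, torch_dtype=torch.bfloat16, device_map="auto")

prompt = "import numpy as np\n# Create a 3x3 identity matrix\n"  # illustrative completion prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# Greedy search with a 2048-token overall budget, matching the settings reported above.
outputs = model.generate(**inputs, max_length=2048, do_sample=False)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```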
DeepSeek-Coder-main/Evaluation/HumanEval/README.md ADDED
@@ -0,0 +1,74 @@
+ ## 1. Introduction
+
+ We provide a test script to evaluate the performance of the **deepseek-coder** model on code generation benchmarks. We select the widely used benchmarks **[HumanEval-Python](https://huggingface.co/datasets/openai_humaneval)** and **[HumanEval-Multilingual](https://huggingface.co/datasets/nuprl/MultiPL-E)**.
+
+ ## 2. Setup
+
+ ```
+ pip install accelerate
+ pip install attrdict
+ pip install transformers
+ pip install torch
+ ```
+
+ ## 3. Evaluation
+
+ We've created a sample script, **eval.sh**, that demonstrates how to test the **DeepSeek-Coder-1.3b-Base** model on the HumanEval dataset leveraging **8** GPUs. If your use case involves a different model or dataset, simply adjust the script to fit your needs.
+
+ Additionally, the execution paths for the various programming languages may differ on your machine. Please update the corresponding paths in the **humaneval/execution.py** file accordingly (a sketch of these variables follows below).
+
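For reference, the interpreter-path variables mentioned above are defined near the top of `human_eval/execution.py`. A minimal sketch of how they might be set (example paths only, not part of this commit; empty strings fall back to whatever is on `PATH`):

```python
# human_eval/execution.py (top of file) -- example values, adjust to your machine.
java_exec = ""   # e.g. "/usr/lib/jvm/jdk-17/bin/" used as a prefix for javac/java
node_exec = ""   # e.g. "/usr/local/bin/" used as a prefix for node
tsc_exec = ""    # e.g. "/usr/local/bin/" used as a prefix for tsc
go_exec = ""     # e.g. "/usr/local/go/bin/" used as a prefix for go
php_exec = ""    # e.g. "/usr/bin/" used as a prefix for php
cs_exec = ""     # e.g. "/usr/bin/" used as a prefix for mcs/mono
```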
+ ```bash
+ MODEL_NAME_OR_PATH="deepseek-ai/deepseek-coder-1.3b-base"
+ DATASET_ROOT="data/"
+ LANGUAGE="python"
+ python -m accelerate.commands.launch --config_file test_config.yaml eval_pal.py --logdir ${MODEL_NAME_OR_PATH} --language ${LANGUAGE} --dataroot ${DATASET_ROOT}
+ ```
+
+ To evaluate the instruction-tuned model, please follow the script below:
+ ```bash
+ LANG="python"
+ OUTPUT_DIR="output"
+ MODEL="deepseek-coder-33b-instruct"
+
+ CUDA_VISIBLE_DEVICES=0,1 python eval_instruct.py \
+     --model "deepseek-ai/$MODEL" \
+     --output_path "$OUTPUT_DIR/${LANG}.$MODEL.jsonl" \
+     --language $LANG \
+     --temp_dir $OUTPUT_DIR
+ ```
+
+ ## 4. Experimental Results
+
+ We report experimental results for 8 mainstream programming languages: **Python**, **C++**, **Java**, **PHP**, **TypeScript**, **C#**, **Bash**, and **JavaScript**. For all open-source models, we use this repository to obtain their performance on the HumanEval dataset. We set the maximum input length to **4096** and the maximum output length to **500**, and employ the **greedy search strategy**.
+
+ #### (1) Multilingual Base Models
+
+ | Model | Size | Python | C++ | Java | PHP | TS | C# | Bash | JS | Avg |
+ |-------------------|------|--------|-------|-------|-------|-------|-------|-------|-------|-------|
+ | code-cushman-001 | 12B | 33.5% | 31.9% | 30.6% | 28.9% | 31.3% | 22.1% | 11.7% | - | - |
+ | CodeShell | 7B | 35.4% | 32.9% | 34.2% | 31.7% | 30.2% | 38.0% | 7.0% | 33.5% | 30.4% |
+ | CodeGeeX2 | 6B | 36.0% | 29.2% | 25.9% | 23.6% | 20.8% | 29.7% | 6.3% | 24.8% | 24.5% |
+ | StarCoderBase | 16B | 31.7% | 31.1% | 28.5% | 25.4% | 34.0% | 34.8% | 8.9% | 29.8% | 28.0% |
+ | CodeLLama | 7B | 31.7% | 29.8% | 34.2% | 23.6% | 36.5% | 36.7% | 12.0% | 29.2% | 29.2% |
+ | CodeLLama | 13B | 36.0% | 37.9% | 38.0% | 34.2% | 45.2% | 43.0% | 16.5% | 32.3% | 35.4% |
+ | CodeLLama | 34B | 48.2% | 44.7% | 44.9% | 41.0% | 42.1% | 48.7% | 15.8% | 42.2% | 41.0% |
+ | | | | | | | | | | | |
+ | DeepSeek-Coder-Base | 1.3B | 34.8% | 31.1% | 32.3% | 24.2% | 28.9% | 36.7% | 10.1% | 28.6% | 28.3% |
+ | DeepSeek-Coder-Base | 5.7B | 48.7% | 45.3% | 41.1% | 39.7% | 44.7% | 41.1% | 27.8% | 42.2% | 41.3% |
+ | DeepSeek-Coder-Base | 6.7B | 49.4% | 50.3% | 43.0% | 38.5% | 49.7% | 50.0% | 28.5% | 48.4% | 44.7% |
+ | DeepSeek-Coder-Base | 33B | **56.1%** | **58.4%** | **51.9%** | **44.1%** | **52.8%** | **51.3%** | **32.3%** | **55.3%** | **50.3%** |
+
+ #### (2) Instruction-Tuned Models
+ | Model | Size | Python | C++ | Java | PHP | TS | C# | Bash | JS | Avg |
+ |---------------------|------|--------|-------|-------|-------|-------|-------|-------|-------|-------|
+ | GPT-3.5-Turbo | - | 76.2% | 63.4% | 69.2% | 60.9% | 69.1% | 70.8% | 42.4% | 67.1% | 64.9% |
+ | GPT-4 | - | **84.1%** | **76.4%** | **81.6%** | **77.2%** | **77.4%** | **79.1%** | **58.2%** | **78.0%** | **76.5%** |
+ | | | | | | | | | | | |
+ | DeepSeek-Coder-Instruct | 1.3B | 65.2% | 45.3% | 51.9% | 45.3% | 59.7% | 55.1% | 12.7% | 52.2% | 48.4% |
+ | DeepSeek-Coder-Instruct | 6.7B | 78.9% | 63.4% | 68.4% | 68.9% | 67.2% | 72.8% | 36.7% | 72.7% | 66.1% |
+ | DeepSeek-Coder-Instruct | 33B | **79.3%** | **68.9%** | **73.4%** | **72.7%** | **67.9%** | **74.1%** | **43.0%** | **73.9%** | **69.2%** |
+
DeepSeek-Coder-main/Evaluation/HumanEval/__pycache__/humaneval.cpython-38.pyc ADDED
Binary file (5.38 kB).
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-cpp ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-cpp.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-cs ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-cs-bu.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-cs.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-d.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-go.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-java ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-java.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-jl.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-js.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-lua.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-php ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-php.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-pl.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-python.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-r.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-rb.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-rkt.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-rs.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-scala.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-sh ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-sh.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-swift.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-ts ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/data/humaneval-ts.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
DeepSeek-Coder-main/Evaluation/HumanEval/eval.sh ADDED
@@ -0,0 +1,4 @@
+ MODEL_NAME_OR_PATH="deepseek/deepseek-coder-1b"
+ DATASET_ROOT="data/"
+ LANGUAGE="python"
+ CUDA_VISIBLE_DEVICES=1,2,3 python -m accelerate.commands.launch --config_file test_config.yaml eval_pal.py --logdir ${MODEL_NAME_OR_PATH} --language ${LANGUAGE} --dataroot ${DATASET_ROOT}
DeepSeek-Coder-main/Evaluation/HumanEval/eval_instruct.py ADDED
@@ -0,0 +1,129 @@
+ import argparse
+ import json
+ import os
+ import torch
+ from pathlib import Path
+ from tqdm import tqdm
+
+ data_abs_dir = Path(__file__).parent / "data"
+
+ from utils.utils import extract_generation_code, languge_settings
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from human_eval.evaluation import evaluate_functional_correctness
+
+ def build_deepseekcoder_instruction(languge: str, question: str):
+     return '''
+ Please continue to complete the function. You are not allowed to modify the given code and do the completion only. Please return all completed function in a codeblock. Here is the given code to do completion:
+ ```{}
+ {}
+ ```
+ '''.strip().format(languge.lower(), question.strip())
+
+ def generate_one(example, lang, tokenizer, model):
+     prompt = build_deepseekcoder_instruction(languge_settings[lang]['full_name'], example['prompt'])
+     inputs = tokenizer.apply_chat_template(
+         [{'role': 'user', 'content': prompt}],
+         return_tensors="pt",
+         add_generation_prompt=True
+     ).to(model.device)
+
+     stop_id = tokenizer.convert_tokens_to_ids("<|EOT|>")
+     assert isinstance(stop_id, int), "Invalid tokenizer, EOT id not found"
+
+     outputs = model.generate(
+         inputs,
+         max_new_tokens=1024,
+         do_sample=False,
+         # top_p=0.95,
+         # temperature=temperature,
+         pad_token_id=stop_id,
+         eos_token_id=stop_id
+     )
+
+     output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
+     example['output'] = output
+
+     return extract_generation_code(example, lang_code=lang)
+
+ def generate_main(args):
+     model_name_or_path = args.model
+     lang = args.language
+     saved_path = args.output_path
+     temp_dir = args.temp_dir
+     os.makedirs(temp_dir, exist_ok=True)
+     problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
+
+     print("model", model_name_or_path)
+     tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+     print("load tokenizer {} from {} over.".format(tokenizer.__class__, model_name_or_path))
+     model = AutoModelForCausalLM.from_pretrained(
+         model_name_or_path,
+         torch_dtype=torch.bfloat16,
+         device_map="auto",
+         # use_flash_attention_2=True
+     )
+     model.eval()
+     examples = [json.loads(x) for x in open(problem_file) if x.strip()]
+     print("Read {} examples for evaluation over.".format(len(examples)))
+
+     generated_examples = []
+     for ex in tqdm(examples, desc='Generating'):
+         gen_example = generate_one(ex, args.language, tokenizer, model)
+         generated_examples.append(gen_example)
+
+     print("Generate all over!!!")
+     with open(saved_path, 'w', encoding='utf-8') as fw:
+         for ex in generated_examples:
+             fw.write(json.dumps(ex) + '\n')
+         print("Save {} processed examples into {} over!".format(len(generated_examples), saved_path))
+
+     result = evaluate_functional_correctness(
+         input_file=saved_path,
+         tmp_dir=temp_dir,
+         n_workers=8,
+         timeout=3.0,
+         problem_file=problem_file,
+         language=lang
+     )
+     print(lang, result, model_name_or_path)
+     pass
+
+ def evaluation_only(args):
+     lang = args.language
+     temp_dir = args.temp_dir
+     assert os.path.exists(args.output_path), "Output file not found: {}".format(args.output_path)
+     os.makedirs(temp_dir, exist_ok=True)
+
+     output_name = os.path.basename(args.output_path)
+     output_examples = [json.loads(x) for x in open(args.output_path) if x.strip()]
+
+     processed_examples = [extract_generation_code(ex, lang) for ex in tqdm(output_examples, "Processing")]
+     processed_path = os.path.join(temp_dir, output_name)
+     with open(processed_path, 'w', encoding='utf-8') as fw:
+         for ex in processed_examples:
+             fw.write(json.dumps(ex) + '\n')
+         print("Save {} processed examples into {} over!".format(len(processed_examples), processed_path))
+
+     problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
+     from human_eval.evaluation import evaluate_functional_correctness
+     result = evaluate_functional_correctness(
+         input_file=processed_path,
+         tmp_dir=temp_dir,
+         n_workers=8,
+         timeout=3.0,
+         problem_file=problem_file,
+         language=lang
+     )
+     print(lang, result)
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--model', type=str, help="model name or path")
+     parser.add_argument('--output_path', type=str, help="output path of your generation")
+     parser.add_argument('--language', type=str, help="language")
+     parser.add_argument('--temp_dir', type=str, help="temp dir for evaluation", default="tmp")
+     args = parser.parse_args()
+
+     os.environ["TOKENIZERS_PARALLELISM"] = "false"
+     generate_main(args)
+     pass
DeepSeek-Coder-main/Evaluation/HumanEval/eval_pal.py ADDED
@@ -0,0 +1,42 @@
+ import os
+ import numpy as np
+ import pandas as pd
+ import torch
+ import torch.nn.functional as F
+ import json
+ import torch.distributed as dist
+ import subprocess
+ import sys
+ from accelerate import Accelerator
+ from accelerate import DistributedDataParallelKwargs
+ from pathlib import Path
+ from argparse import ArgumentParser
+ from humaneval import HumanEval as evaltor
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ if __name__ == '__main__':
+     kwargs_handlers = [DistributedDataParallelKwargs(find_unused_parameters=True)]
+     accelerator = Accelerator(mixed_precision="bf16", kwargs_handlers=kwargs_handlers)
+
+     parser = ArgumentParser()
+     parser.add_argument("--logdir", type=str, default="")
+     parser.add_argument("--language", type=str, default="")
+     parser.add_argument("--dataroot", type=str, default="")
+     args = parser.parse_args()
+
+     logdir = args.logdir
+     language = args.language
+
+     if logdir == "":
+         logdir = "tmp/"
+     tokenizer = dict(
+         cls=AutoTokenizer,
+         model_path=logdir,)
+
+     dataroot = args.dataroot
+
+     evaluator = evaltor(data_root=dataroot, max_seq_len=4096, tokenizer_cfg=tokenizer, log_dir=logdir, n_sample=1, batch_size=1, language=language, max_gen_len=500)
+     model = AutoModelForCausalLM.from_pretrained(logdir, device_map=accelerator.device, trust_remote_code=True, torch_dtype=torch.bfloat16)
+     os.environ["TOKENIZERS_PARALLELISM"] = "false"
+     evaluator.eval_model(model, accelerator)
DeepSeek-Coder-main/Evaluation/HumanEval/human_eval/__init__.py ADDED
File without changes
DeepSeek-Coder-main/Evaluation/HumanEval/human_eval/data.py ADDED
@@ -0,0 +1,49 @@
+ from typing import Iterable, Dict
+ import gzip
+ import json
+ import os
+
+
+ ROOT = os.path.dirname(os.path.abspath(__file__))
+ HUMAN_EVAL = os.path.join(ROOT, "..", "data", "HumanEval.jsonl.gz")
+
+
+ def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]:
+     return {task["task_id"]: task for task in stream_jsonl(evalset_file)}
+
+
+ def stream_jsonl(filename: str) -> Iterable[Dict]:
+     """
+     Parses each jsonl line and yields it as a dictionary
+     """
+     if filename.endswith(".gz"):
+         with open(filename, "rb") as gzfp:
+             with gzip.open(gzfp, 'rt') as fp:
+                 for line in fp:
+                     if any(not x.isspace() for x in line):
+                         yield json.loads(line)
+     else:
+         with open(filename, "r", encoding="utf-8") as fp:
+             for line in fp:
+                 if any(not x.isspace() for x in line):
+                     yield json.loads(line)
+
+
+ def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
+     """
+     Writes an iterable of dictionaries to jsonl
+     """
+     if append:
+         mode = 'ab'
+     else:
+         mode = 'wb'
+     filename = os.path.expanduser(filename)
+     if filename.endswith(".gz"):
+         with open(filename, mode) as fp:
+             with gzip.GzipFile(fileobj=fp, mode='wb') as gzfp:
+                 for x in data:
+                     gzfp.write((json.dumps(x) + "\n").encode('utf-8'))
+     else:
+         with open(filename, mode) as fp:
+             for x in data:
+                 fp.write((json.dumps(x) + "\n").encode('utf-8'))
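A brief usage sketch for the helpers above; the sample payloads and output file name are illustrative, and the problem file path assumes the data/ folder shipped alongside this evaluation code:

```python
from human_eval.data import read_problems, write_jsonl, stream_jsonl

# Load the Python problems shipped in data/ (path relative to the HumanEval folder).
problems = read_problems("data/humaneval-python.jsonl")      # task_id -> problem dict
samples = [{"task_id": tid, "generation": ""} for tid in list(problems)[:3]]
write_jsonl("samples.jsonl", samples)                        # a ".gz" suffix enables gzip output
print(sum(1 for _ in stream_jsonl("samples.jsonl")))         # -> 3
```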
DeepSeek-Coder-main/Evaluation/HumanEval/human_eval/evaluate_functional_correctness.py ADDED
@@ -0,0 +1,29 @@
+ import fire
+ import sys
+
+ from .data import HUMAN_EVAL
+ from .evaluation import evaluate_functional_correctness
+
+
+ def entry_point(
+     sample_file: str,
+     k: str = "1,10,100",
+     n_workers: int = 4,
+     timeout: float = 3.0,
+     problem_file: str = "",
+     is_mbpp: bool = False,
+ ):
+     """
+     Evaluates the functional correctness of generated samples, and writes
+     results to f"{sample_file}_results.jsonl.gz"
+     """
+     k = list(map(int, k.split(",")))
+     results = evaluate_functional_correctness(sample_file, k=k, n_workers=n_workers,
+                                               timeout=timeout, problem_file=problem_file, is_mbpp=is_mbpp)
+     print(results)
+
+
+ def main():
+     fire.Fire(entry_point)
+
+
+ sys.exit(main())
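eval_instruct.py (above) calls the evaluator directly rather than through this CLI wrapper; a hedged sketch of the equivalent direct call, with illustrative paths, run from the HumanEval folder so that the `human_eval` package resolves:

```python
from human_eval.evaluation import evaluate_functional_correctness

result = evaluate_functional_correctness(
    input_file="output/python.deepseek-coder-33b-instruct.jsonl",  # samples with "task_id", "prompt", "generation"
    tmp_dir="output",
    n_workers=8,
    timeout=3.0,
    problem_file="data/humaneval-python.jsonl",
    language="python",
)
print(result)  # e.g. {"pass@1": ...} when every problem has at least one sample
```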
DeepSeek-Coder-main/Evaluation/HumanEval/human_eval/evaluation.py ADDED
@@ -0,0 +1,296 @@
+ import os
+ import sys
+ import fire
+ import json
+ import gzip
+ import regex
+ import numpy as np
+ import itertools
+
+ from typing import *
+ from tqdm.auto import tqdm
+ from collections import defaultdict, Counter
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from .data import stream_jsonl
+ from .execution import check_correctness
+
+ IMPORT_HELPER = {
+     "python": [
+         "import math",
+         "import re",
+         "import sys",
+         "import copy",
+         "import datetime",
+         "import itertools",
+         "import collections",
+         "import heapq",
+         "import functools",
+         "import hashlib",
+         "import numpy",
+         "import numpy as np",
+         "import string",
+         "from typing import *",
+         "from collections import *",
+     ],
+     "go": [
+         "math",
+         "strings",
+         "fmt",
+         "strconv",
+         "time",
+         "bytes",
+         "regexp",
+         "sort",
+         "math/rand",
+         "crypto/md5",
+     ],
+     "cpp": [
+         "#include<stdlib.h>",
+         "#include<algorithm>",
+         "#include<math.h>",
+         "#include<stdio.h>",
+         "#include<vector>",
+         "#include<string>",
+         "#include<climits>",
+         "#include<cstring>",
+         "#include<iostream>",
+         "#include<cassert>"
+     ],
+     "cs": ["using System.Numerics;", "using System.Diagnostics;", "using System.Collections.Generic;", "using System.Linq;", "using System.Text;", "using System.Security.Cryptography;", "using System.Collections.Generic;"]
+ }
+
+
+ LANGUAGE_NAME = {
+     "cpp": "CPP",
+     "go": "Go",
+     "java": "Java",
+     "js": "JavaScript",
+     "python": "Python",
+ }
+
+
+ def read_dataset(
+     data_file: str = None,
+     dataset_type: str = "humaneval",
+     num_shot=None,
+ ) -> Dict:
+     """
+     Reads a dataset and returns a dictionary of tasks.
+     """
+     if num_shot is not None:
+         print(f"{num_shot}-shot setting...")
+     if "humaneval" in dataset_type.lower():
+         if data_file is None:
+             current_path = os.path.dirname(os.path.abspath(__file__))
+             data_file = os.path.join(current_path, "..", "humaneval-x", "python", "data", "humaneval_python.jsonl.gz")
+         dataset = {task["task_id"]: task for task in stream_jsonl(data_file)}
+     else:
+         raise NotImplementedError(f"Dataset: {dataset_type} not supported.")
+
+     return dataset
+
+ def estimate_pass_at_k(
+     num_samples: Union[int, List[int], np.ndarray],
+     num_correct: Union[List[int], np.ndarray],
+     k: int
+ ) -> np.ndarray:
+     """
+     Estimates pass@k of each problem and returns them in an array.
+     """
+
+     def estimator(n: int, c: int, k: int) -> float:
+         """
+         Calculates 1 - comb(n - c, k) / comb(n, k).
+         """
+         if n - c < k:
+             return 1.0
+         return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
+
+     if isinstance(num_samples, int):
+         num_samples_it = itertools.repeat(num_samples, len(num_correct))
+     else:
+         assert len(num_samples) == len(num_correct)
+         num_samples_it = iter(num_samples)
+
+     return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
+
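As a quick sanity check of the unbiased pass@k estimator above, here is a self-contained sketch; the sample counts are made up for illustration:

```python
import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased estimator 1 - C(n-c, k) / C(n, k), computed as a stable product."""
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# Example: 20 generations for one task, 5 of which pass the tests.
print(round(pass_at_k(n=20, c=5, k=1), 4))   # 0.25   (equals c / n for k = 1)
print(round(pass_at_k(n=20, c=5, k=10), 4))  # 0.9837 (near-certain within 10 draws)
```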
+ def process_humaneval_test(sample, problems, example_test=False, is_mbpp=False, language="python"):
+     """
+     Processes a sample for evaluation.
+     """
+     task_id = sample["task_id"]
+     if is_mbpp:
+         return sample["generation"] + "\n" + "\n".join(problems[task_id]["test"])
+
+     prompt = sample["prompt"]
+     if example_test and "example_test" in problems[task_id] and problems[task_id]["example_test"] != "":
+         test = problems[task_id]["example_test"]
+     else:
+         test = problems[task_id]["test"]
+     code = sample["generation"]
+
+     # Pre-process for different languages
+     if language == "python":
+         test_setup = "\n".join(IMPORT_HELPER["python"]) + "\n"
+         test_string = test_setup + code + "\n" + test + "\n"
+     elif language == "cpp":
+         test_set_up = ""
+         for s in IMPORT_HELPER["cpp"]:
+             if s not in prompt:
+                 test_set_up += s + "\n"
+         test_string = test_set_up + "\n" + code + "\n" + test
+     elif language == "java":
+         test_string = code + "\n" + test
+     elif language == "cs":
+         test_set_up = ""
+         for s in IMPORT_HELPER["cs"]:
+             test_set_up += s + "\n"
+         test_string = test_set_up + "\n" + code + "\n" + test
+     elif language in ["js", "javascript", "ts", "sh", "go"]:
+         test_string = code + "\n" + test
+     elif language == "go232":
+         import_string = problems[task_id]["import"]
+         prompt = prompt.replace(import_string, "")
+         if example_test and "example_test" in problems[task_id]:
+             test = problems[task_id]["example_test"]
+         else:
+             test = problems[task_id]["test"]
+         test_setup = problems[task_id]["test_setup"]
+         other_pkgs = []
+         for pkg in IMPORT_HELPER["go"]:
+             if pkg not in test_setup:
+                 p = pkg.split("/")[-1]
+                 if p + "." in code:
+                     other_pkgs.append(f"\"{pkg}\"")
+         if other_pkgs:
+             import_other_pkgs = "import (\n" + " ".join([p + "\n" for p in other_pkgs]) + ")"
+             test_string = test_setup + "\n" + import_other_pkgs + "\n" + prompt + code + "\n" + test
+         else:
+             test_string = test_setup + "\n" + prompt + code + "\n" + test
+     elif language == "rust":
+         main = "\nfn main(){ \n } \n"
+         declaration = problems[task_id]["declaration"]
+         test_string = main + declaration + prompt + code + test
+     elif language == "php":
+         if code[:5] != "<?php":
+             code = "<?php\n" + code
+         test_string = code + "\n" + test + "?>"
+     return test_string
+
+
+ def stream_jsonl_all(filename: str) -> Iterable[Dict]:
+     """
+     Streams a JSONL file.
+     """
+     results = []
+     if filename.endswith(".gz"):
+         fp = gzip.open(open(filename, "rb"), "rt")
+     else:
+         fp = open(filename, "r")
+     for line in fp:
+         if any(not x.isspace() for x in line):
+             results.append(json.loads(line))
+     fp.close()
+
+     return results
+
+
+ def evaluate_functional_correctness(
+     input_file: str = None,
+     tmp_dir: str = "./",
+     n_workers: int = 32,
+     timeout: float = 10.0,
+     problem_file: str = "../data/humaneval_python.jsonl.gz",
+     out_dir: str = None,
+     k: List[int] = [1, 10, 100],
+     test_groundtruth: bool = False,
+     example_test: bool = False,
+     is_mbpp: bool = False,
+     language: str = "python",
+ ):
+     """
+     Evaluates the functional correctness of a model.
+     """
+     if example_test:
+         print("Example test...")
+
+     problems = read_dataset(problem_file,
+                             dataset_type="humaneval")
+     sample_jsonl = stream_jsonl_all(input_file)
+
+     with ThreadPoolExecutor(max_workers=n_workers) as executor:
+
+         futures = []
+         completion_id = Counter()
+         n_samples = 0
+         results = defaultdict(list)
+
+         if test_groundtruth:
+             print("Testing ground truth...")
+             for sample in tqdm(problems.values()):
+                 task_id = sample["task_id"]
+                 lang = task_id.split("/")[0].lower()
+                 if lang == "javascript":
+                     lang = "js"
+                 tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
+                 sample["generation"] = sample["canonical_solution"]
+                 sample["test_code"] = process_humaneval_test(sample, problems, example_test, language)
+                 if sample["test_code"] is None:
+                     continue
+                 args = (task_id, sample, lang, timeout, tmp_dir_, completion_id[task_id])
+                 future = executor.submit(check_correctness, *args)
+                 futures.append(future)
+                 completion_id[task_id] += 1
+                 n_samples += 1
+         else:
+             print("Reading samples...")
+             for sample in tqdm(sample_jsonl):
+                 task_id = sample["task_id"]
+                 if not is_mbpp:
+                     lang = language
+                 if not is_mbpp and lang == "javascript":
+                     lang = "js"
+                 if is_mbpp:
+                     lang = "python"
+                 tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
+                 sample["task_id"] = task_id
+                 sample["test_code"] = process_humaneval_test(sample, problems, example_test, is_mbpp, language)
+                 if sample["test_code"] is None:
+                     continue
+                 if "completion_id" in sample:
+                     completion_id_ = sample["completion_id"]
+                 else:
+                     completion_id_ = completion_id[task_id]
+                 args = (task_id, sample, lang, timeout, tmp_dir_, completion_id_)
+                 future = executor.submit(check_correctness, *args)
+                 futures.append(future)
+                 completion_id[task_id] += 1
+                 n_samples += 1
+
+         if len(completion_id) == len(problems):
+             evaluate_pass_at_k = True
+         else:
+             evaluate_pass_at_k = False
+
+         print("Running test suites...")
+         for future in tqdm(as_completed(futures), total=len(futures)):
+             result = future.result()
+             results[result["task_id"]].append((result["completion_id"], result))
+
+     # Calculate pass@k.
+     total, correct = [], []
+     for result in results.values():
+         passed = [r[1]["passed"] for r in result]
+         total.append(len(passed))
+         correct.append(sum(passed))
+     total = np.array(total)
+     correct = np.array(correct)
+     if evaluate_pass_at_k:
+         ks = k
+         pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean()
+                      for k in ks if (total >= k).all()}
+         print(pass_at_k)
+     else:
+         print("Total:", np.sum(total))
+         print("Correct:", np.sum(correct))
+     return pass_at_k
DeepSeek-Coder-main/Evaluation/HumanEval/human_eval/execution.py ADDED
@@ -0,0 +1,731 @@
1
+ import contextlib
2
+ import faulthandler
3
+ import io
4
+ import multiprocessing
5
+ import os
6
+ import platform
7
+ import signal
8
+ import random
9
+ import subprocess
10
+ import tempfile
11
+ import gzip
12
+ import json
13
+ from typing import *
14
+ import traceback
15
+
16
+ java_exec = ""
17
+ node_exec = ""
18
+ tsc_exec = ""
19
+ go_exec = ""
20
+ php_exec = ""
21
+ cs_exec = ""
22
+
23
+ def check_correctness(
24
+ task_id: str,
25
+ sample: dict,
26
+ language_type: str,
27
+ timeout: float = 3.0,
28
+ tmp_dir: str = None,
29
+ completion_id: Optional[int] = None,
30
+ ) -> Dict:
31
+ """
32
+ Evaluates the functional correctness of a completion by running the test
33
+ suite provided in the problem.
34
+ """
35
+
36
+ def unsafe_execute(tmp_dir):
37
+ random_id = random.randint(1, 100000)
38
+ if "python" in language_type.lower():
39
+ with create_tempdir():
40
+
41
+ # These system calls are needed when cleaning up tempdir.
42
+ import os
43
+ import shutil
44
+ rmtree = shutil.rmtree
45
+ rmdir = os.rmdir
46
+ chdir = os.chdir
47
+
48
+ # Disable functionalities that can make destructive changes to the test.
49
+ reliability_guard()
50
+
51
+ try:
52
+ exec_globals = {}
53
+ with swallow_io():
54
+ with time_limit(timeout):
55
+ # WARNING
56
+ # This program exists to execute untrusted model-generated code. Although
57
+ # it is highly unlikely that model-generated code will do something overtly
58
+ # malicious in response to this test suite, model-generated code may act
59
+ # destructively due to a lack of model capability or alignment.
60
+ # Users are strongly encouraged to sandbox this evaluation suite so that it
61
+ # does not perform destructive actions on their host or network.
62
+ # Once you have read this disclaimer and taken appropriate precautions,
63
+ # uncomment the following line and proceed at your own risk:
64
+ exec(sample["test_code"], exec_globals)
65
+ result.append("passed")
66
+ except TimeoutException:
67
+ result.append("timed out")
68
+ except AssertionError as e:
69
+ result.append(f"failed: AssertionError")
70
+ except BaseException as e:
71
+ result.append(f"failed: {e}")
72
+ #print(sample["test_code"])
73
+ #print(result)
74
+ # Needed for cleaning up.
75
+ shutil.rmtree = rmtree
76
+ os.rmdir = rmdir
77
+ os.chdir = chdir
78
+
79
+ elif "go" in language_type.lower():
80
+ assert tmp_dir is not None, "Go should be evaluated in a dir where necessary module files installed."
81
+
82
+ import os
83
+ import shutil
84
+
85
+ if "tmp" not in tmp_dir:
86
+ tmp_dir = os.path.join(tmp_dir, "tmp")
87
+ tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
88
+ if not os.path.exists(tmp_dir):
89
+ os.makedirs(tmp_dir)
90
+ origin_path = os.getcwd()
91
+ os.chdir(tmp_dir)
92
+ open(f"main_test.go", 'w').write(sample["test_code"])
93
+ try:
94
+ exec_result = None
95
+ with time_limit(timeout):
96
+ # WARNING
97
+ # This program exists to execute untrusted model-generated code. Although
98
+ # it is highly unlikely that model-generated code will do something overtly
99
+ # malicious in response to this test suite, model-generated code may act
100
+ # destructively due to a lack of model capability or alignment.
101
+ # Users are strongly encouraged to sandbox this evaluation suite so that it
102
+ # does not perform destructive actions on their host or network.
103
+ # Once you have read this disclaimer and taken appropriate precautions,
104
+ # uncomment the following line and proceed at your own risk:
105
+ exec_result = subprocess.run([f"{go_exec}go", "test", f"-timeout={timeout}s", "main_test.go"], timeout=timeout, capture_output=True)
106
+
107
+ if exec_result.returncode == 0:
108
+ result.append("passed")
109
+ else:
110
+ if exec_result.stderr:
111
+ try:
112
+ err = exec_result.stderr.decode()
113
+ except:
114
+ err = exec_result.stderr
115
+ else:
116
+ try:
117
+ err = exec_result.stdout.decode()
118
+ except:
119
+ err = exec_result.stdout
120
+ result.append(f"failed: {err}")
121
+
122
+ except TimeoutException:
123
+ result.append("timed out")
124
+ os.chdir(origin_path)
125
+ shutil.rmtree(tmp_dir)
126
+ elif "js" in language_type.lower():
127
+ import os
128
+ import shutil
129
+
130
+ if "tmp" not in tmp_dir:
131
+ tmp_dir = os.path.join(tmp_dir, "tmp")
132
+ tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
133
+ if not os.path.exists(tmp_dir):
134
+ os.makedirs(tmp_dir)
135
+ origin_path = os.getcwd()
136
+ os.chdir(tmp_dir)
137
+ open(f"test.js", 'w').write(sample["test_code"])
138
+ try:
139
+ exec_result = None
140
+ with time_limit(timeout):
141
+ # WARNING
142
+ # This program exists to execute untrusted model-generated code. Although
143
+ # it is highly unlikely that model-generated code will do something overtly
144
+ # malicious in response to this test suite, model-generated code may act
145
+ # destructively due to a lack of model capability or alignment.
146
+ # Users are strongly encouraged to sandbox this evaluation suite so that it
147
+ # does not perform destructive actions on their host or network.
148
+ # Once you have read this disclaimer and taken appropriate precautions,
149
+ # uncomment the following line and proceed at your own risk:
150
+ exec_result = subprocess.run([f"{node_exec}node", "test.js"], timeout=timeout, capture_output=True)
151
+
152
+ if exec_result.stderr.decode():
153
+ err = exec_result.stderr.decode()
154
+ result.append(f"failed: {err}")
155
+ elif exec_result.stdout.decode():
156
+ err = exec_result.stdout.decode()
157
+ result.append(f"failed: {err}")
158
+ else:
159
+ result.append("passed")
160
+
161
+ except TimeoutException:
162
+ result.append("timed out")
163
+ os.chdir(origin_path)
164
+ shutil.rmtree(tmp_dir)
165
+ elif "cpp" in language_type.lower():
166
+ import os
167
+ import shutil
168
+ origin_path = os.getcwd()
169
+ if "tmp" not in tmp_dir:
170
+ tmp_dir = os.path.join(tmp_dir, "tmp")
171
+ tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
172
+ if not os.path.exists(tmp_dir):
173
+ os.makedirs(tmp_dir)
174
+
175
+ os.chdir(tmp_dir)
176
+ open(f"test.cpp", 'w').write(sample["test_code"])
177
+ if "162" in task_id:
178
+ compilation_result = subprocess.run(["/usr/bin/g++", "-std=c++17", "test.cpp", "-lcrypto", "-lssl"],
179
+ timeout=timeout,
180
+ capture_output=True)
181
+ else:
182
+ compilation_result = subprocess.run(["/usr/bin/g++", "-std=c++17", "test.cpp"], timeout=timeout,
183
+ capture_output=True)
184
+ if compilation_result.returncode != 0:
185
+ if compilation_result.stderr:
186
+ err = compilation_result.stderr.decode()
187
+ else:
188
+ err = compilation_result.stdout.decode()
189
+ result.append(f"failed: compilation error: {err}")
190
+ else:
191
+ try:
192
+ exec_result = None
193
+ with time_limit(timeout):
194
+ # WARNING
195
+ # This program exists to execute untrusted model-generated code. Although
196
+ # it is highly unlikely that model-generated code will do something overtly
197
+ # malicious in response to this test suite, model-generated code may act
198
+ # destructively due to a lack of model capability or alignment.
199
+ # Users are strongly encouraged to sandbox this evaluation suite so that it
200
+ # does not perform destructive actions on their host or network.
201
+ # Once you have read this disclaimer and taken appropriate precautions,
202
+ # uncomment the following line and proceed at your own risk:
203
+ exec_result = subprocess.run(["./a.out"], timeout=timeout, capture_output=True)
204
+
205
+ if exec_result.returncode == 0:
206
+ result.append("passed")
207
+ else:
208
+ if exec_result.stderr:
209
+ try:
210
+ err = exec_result.stderr.decode()
211
+ except:
212
+ err = exec_result.stderr
213
+ else:
214
+ try:
215
+ err = exec_result.stdout.decode()
216
+ except:
217
+ err = exec_result.stdout
218
+ result.append(f"failed: {err}")
219
+ except TimeoutException:
220
+ result.append("timed out")
221
+ #print(result[-1])
222
+ #print(sample["test_code"])
223
+ os.chdir(origin_path)
224
+ shutil.rmtree(tmp_dir)
225
+ elif "php" in language_type.lower():
226
+ import os
227
+ import shutil
228
+ origin_path = os.getcwd()
229
+ if "tmp" not in tmp_dir:
230
+ tmp_dir = os.path.join(tmp_dir, "tmp")
231
+ tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
232
+ if not os.path.exists(tmp_dir):
233
+ os.makedirs(tmp_dir)
234
+
235
+ os.chdir(tmp_dir)
236
+ open(f"test.php", 'w').write(sample["test_code"])
237
+ try:
238
+ exec_result = None
239
+ with time_limit(timeout):
240
+ cmd = f"{php_exec}php -f test.php"
241
+ exec_result = subprocess.run(cmd, timeout=timeout, capture_output=True, shell=True)
242
+
243
+ if exec_result.returncode == 0:
244
+ result.append("passed")
245
+ else:
246
+ if exec_result.stderr:
247
+ try:
248
+ err = exec_result.stderr.decode()
249
+ except:
250
+ err = exec_result.stderr
251
+ else:
252
+ try:
253
+ err = exec_result.stdout.decode()
254
+ except:
255
+ err = exec_result.stdout
256
+ result.append(f"failed: {err}")
257
+ except TimeoutException:
258
+ result.append("timed out")
259
+ print(result[-1])
260
+ print(sample["test_code"])
261
+ os.chdir(origin_path)
262
+ shutil.rmtree(tmp_dir)
263
+ elif "sh" in language_type.lower():
264
+ import os
265
+ import shutil
266
+ origin_path = os.getcwd()
267
+ if "tmp" not in tmp_dir:
268
+ tmp_dir = os.path.join(tmp_dir, "tmp")
269
+ tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
270
+ if not os.path.exists(tmp_dir):
271
+ os.makedirs(tmp_dir)
272
+
273
+ os.chdir(tmp_dir)
274
+ open(f"test.sh", 'w').write(sample["test_code"])
275
+ try:
276
+ exec_result = None
277
+ with time_limit(timeout):
278
+ cmd = "/bin/bash test.sh"
279
+ exec_result = subprocess.run(cmd, timeout=10, capture_output=True, shell=True)
280
+
281
+ if exec_result.returncode == 0:
282
+ result.append("passed")
283
+ else:
284
+ if exec_result.stderr:
285
+ try:
286
+ err = exec_result.stderr.decode()
287
+ except:
288
+ err = exec_result.stderr
289
+ else:
290
+ try:
291
+ err = exec_result.stdout.decode()
292
+ except:
293
+ err = exec_result.stdout
294
+ result.append(f"failed: {err}")
295
+ except TimeoutException:
296
+ result.append("timed out")
297
+ #print(result[-1])
298
+ #print(sample["test_code"])
299
+ os.chdir(origin_path)
300
+ shutil.rmtree(tmp_dir)
301
+ elif "ts" in language_type.lower():
302
+ import os
303
+ import shutil
304
+ origin_path = os.getcwd()
305
+ if "tmp" not in tmp_dir:
306
+ tmp_dir = os.path.join(tmp_dir, "tmp")
307
+ tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
308
+ if not os.path.exists(tmp_dir):
309
+ os.makedirs(tmp_dir)
310
+
311
+ os.chdir(tmp_dir)
312
+ env = {"PATH": f"{node_exec}:" + subprocess.os.environ["PATH"]}
313
+ open(f"test.ts", 'w').write(sample["test_code"])
314
+ cmd = f"{tsc_exec}tsc test.ts --target ES2015 --lib ES2015,DOM"
315
+ compilation_result = subprocess.run(cmd, timeout=timeout, capture_output=True, env=env, shell=True)
316
+ if compilation_result.returncode != 0:
317
+ if compilation_result.stderr:
318
+ err = compilation_result.stderr.decode()
319
+ else:
320
+ err = compilation_result.stdout.decode()
321
+ result.append(f"failed: compilation error: {err}")
322
+ else:
323
+ try:
324
+ exec_result = None
325
+ with time_limit(timeout):
326
+ exec_result = subprocess.run([f"{node_exec}node", "test.js"], timeout=timeout, capture_output=True)
327
+
328
+ if exec_result.returncode == 0:
329
+ result.append("passed")
330
+ else:
331
+ if exec_result.stderr:
332
+ try:
333
+ err = exec_result.stderr.decode()
334
+ except:
335
+ err = exec_result.stderr
336
+ else:
337
+ try:
338
+ err = exec_result.stdout.decode()
339
+ except:
340
+ err = exec_result.stdout
341
+ result.append(f"failed: {err}")
342
+ except TimeoutException:
343
+ result.append("timed out")
344
+ if result[-1] != "passed":
345
+ env = {"PATH": f"{node_exec}:" + subprocess.os.environ["PATH"]}
346
+ cmd = f"{tsc_exec}tsc test.ts"
347
+ compilation_result = subprocess.run(cmd, timeout=timeout, capture_output=True, env=env, shell=True)
348
+ if compilation_result.returncode != 0:
349
+ if compilation_result.stderr:
350
+ err = compilation_result.stderr.decode()
351
+ else:
352
+ err = compilation_result.stdout.decode()
353
+ result[-1] = f"failed: compilation error: {err}"
354
+ else:
355
+ try:
356
+ exec_result = None
357
+ with time_limit(timeout):
358
+ exec_result = subprocess.run([f"{node_exec}node", "test.js"], timeout=timeout, capture_output=True)
359
+
360
+ if exec_result.returncode == 0:
361
+ result[-1] = "passed"
362
+ else:
363
+ if exec_result.stderr:
364
+ try:
365
+ err = exec_result.stderr.decode()
366
+ except:
367
+ err = exec_result.stderr
368
+ else:
369
+ try:
370
+ err = exec_result.stdout.decode()
371
+ except:
372
+ err = exec_result.stdout
373
+ result[-1] = f"failed: {err}"
374
+ except TimeoutException:
375
+ result[-1] = "timed out"
376
+
377
+ os.chdir(origin_path)
378
+ shutil.rmtree(tmp_dir)
379
+ elif "cs" in language_type.lower():
380
+ import os
381
+ import shutil
382
+ origin_path = os.getcwd()
383
+ if "tmp" not in tmp_dir:
384
+ tmp_dir = os.path.join(tmp_dir, "tmp")
385
+ tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
386
+ if not os.path.exists(tmp_dir):
387
+ os.makedirs(tmp_dir)
388
+ os.chdir(tmp_dir)
389
+ open(f"Program.cs", 'w').write(sample["test_code"])
390
+ cmd = f"{cs_exec}mcs -d:DEBUG Program.cs"
391
+ compilation_result = subprocess.run(cmd, shell=True, capture_output=True)
392
+ if compilation_result.returncode != 0:
393
+ if compilation_result.stderr:
394
+ err = compilation_result.stderr.decode()
395
+ else:
396
+ err = compilation_result.stdout.decode()
397
+ result.append(f"failed: compilation error: {err}")
398
+ else:
399
+ try:
400
+ exec_result = None
401
+ cmd = f"{cs_exec}mono Program.exe"
402
+ env = dict(MONO_TRACE_LISTENER="Console.Error")
403
+ with time_limit(timeout):
404
+ exec_result = subprocess.run(cmd, timeout=timeout, shell=True, capture_output=True, env=env)
405
+
406
+ if "Fail" not in exec_result.stderr.decode():
407
+ result.append("passed")
408
+ else:
409
+ if exec_result.stderr:
410
+ try:
411
+ err = exec_result.stderr.decode()
412
+ except:
413
+ err = exec_result.stderr
414
+ else:
415
+ try:
416
+ err = exec_result.stdout.decode()
417
+ except:
418
+ err = exec_result.stdout
419
+ result.append(f"failed: {err}")
420
+ except TimeoutException:
421
+ result.append("timed out")
422
+ except Exception as e:
423
+ result.append(f"failed: {e}")
424
+ os.chdir(origin_path)
425
+ shutil.rmtree(tmp_dir)
426
+ elif "rust" in language_type.lower():
427
+ import os
428
+
429
+ WD: str = os.path.dirname(os.path.abspath(__file__))
430
+ RUST_DIR: str = os.path.join(WD, "rust")
431
+ RUST_SRC: str = os.path.join(RUST_DIR, "src")
432
+ RUST_BIN: str = os.path.join(RUST_SRC, "bin")
433
+ RUST_TMP_DIR: str = os.path.join(RUST_DIR, "tmp")
434
+ RUST_LOGS: str = os.path.join(RUST_TMP_DIR, "logs")
435
+ RUST_EXT: str = ".rs"
436
+
437
+ # Create mandatory tmp directories
438
+ os.makedirs(RUST_TMP_DIR, exist_ok=True)
439
+ os.makedirs(RUST_LOGS, exist_ok=True)
440
+ os.makedirs(RUST_SRC, exist_ok=True)
441
+ os.makedirs(RUST_BIN, exist_ok=True)
442
+
443
+ with tempfile.NamedTemporaryFile(dir = RUST_BIN, delete=False) as f:
444
+ #temporal file name
445
+ file_prefix = sample["task_id"].lower().replace("/", "_")
446
+ file_name: str = file_prefix + RUST_EXT
447
+
448
+ os.rename(f.name, os.path.join(RUST_BIN, file_name))
449
+
450
+ # Sample to pure Rust function
451
+ rust_code: str = sample["test_code"]
452
+
453
+ # dump the Rust source code into the target temporary file
454
+ f.write(rust_code.encode('utf-8'))
455
+
456
+ # Compile the Rust binary: move to the Rust module root dir first.
457
+ os.chdir(RUST_DIR)
458
+
459
+ # Two possible outcomes
460
+ # Pass OR Fail compilation
461
+ log_filename: str = file_prefix + ".jsonl"
462
+ log_path: str = os.path.join(RUST_LOGS, log_filename)
463
+ cargo_check: str = "cargo check --bin " + file_prefix + " --message-format json >> " + log_path
464
+ # Compilation build status
465
+ returned_val_compilation: int
466
+
467
+ # Overwrite file content
468
+ if os.path.exists(log_path):
469
+ if (file_size := os.path.getsize(log_path)) >= 0:
470
+ os.remove(log_path)
471
+ returned_val_compilation = os.system(cargo_check)
472
+
473
+ else:
474
+ returned_val_compilation = os.system(cargo_check)
475
+
476
+ # 0 means success
477
+ if returned_val_compilation == 0:
478
+
479
+ #Execution pipeline
480
+ cargo_test: str = "cargo test --bin " + file_prefix + " --message-format json >> " + log_path
481
+ returned_val_execution = os.system(cargo_test)
482
+
483
+ if returned_val_execution == 0:
484
+ result.append("passed")
485
+ else:
486
+ result.append(f"failed: execution error")
487
+
488
+ else:
489
+ result.append(f"failed: compilation error")
490
+
491
+
492
+ elif "java" in language_type.lower():
493
+ assert tmp_dir is not None, "Java should be evaluated in a temporary dir."
494
+
495
+ import os
496
+ import shutil
497
+
498
+ if "tmp" not in tmp_dir:
499
+ tmp_dir = os.path.join(tmp_dir, "tmp")
500
+ tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
501
+ if not os.path.exists(tmp_dir):
502
+ os.makedirs(tmp_dir)
503
+ open(os.path.join(tmp_dir, "Problem.java"), 'w').write(sample["test_code"])
504
+ origin_path = os.getcwd()
505
+ os.system(f"cp ./javatuples-1.2.jar {tmp_dir}/")
506
+ os.chdir(tmp_dir)
507
+ res = "failed: unknown error"
508
+ compile_returncode = -1
509
+ for _ in range(5):
510
+ try:
511
+ cmd = f"{java_exec}javac -cp javatuples-1.2.jar Problem.java"
512
+ compilation_result = subprocess.run(cmd, timeout=60, capture_output=True, shell=True)
513
+ compile_returncode = compilation_result.returncode
514
+ break
515
+ except subprocess.TimeoutExpired as e:
516
+ continue
517
+ if compile_returncode != 0:
518
+ res = "failed: compilation error"
519
+ else:
520
+ exec_result = None
521
+ try:
522
+ # WARNING
523
+ # This program exists to execute untrusted model-generated code. Although
524
+ # it is highly unlikely that model-generated code will do something overtly
525
+ # malicious in response to this test suite, model-generated code may act
526
+ # destructively due to a lack of model capability or alignment.
527
+ # Users are strongly encouraged to sandbox this evaluation suite so that it
528
+ # does not perform destructive actions on their host or network.
529
+ # Once you have read this disclaimer and taken appropriate precautions,
530
+ # proceed at your own risk; the command below runs the generated code directly:
531
+ cmd = f"{java_exec}java -ea -cp .:javatuples-1.2.jar Problem"
532
+ exec_result = subprocess.run(cmd, timeout=timeout, capture_output=True, shell=True)
533
+ if exec_result.returncode == 0:
534
+ res = "passed"
535
+ elif exec_result.returncode == 1:
536
+ if "AssertionError" in exec_result.stderr.decode('unicode-escape'):
537
+ res = "failed: wrong answer"
538
+ else:
539
+ res = f"failed: {exec_result.stderr.decode()}"
540
+ except subprocess.TimeoutExpired as e:
541
+ res = "time out"
542
+ except BaseException as e:
543
+ res = f"failed: {e}"
544
+
545
+ result.append(res)
546
+ os.chdir(origin_path)
547
+ shutil.rmtree(tmp_dir)
548
+
549
+ manager = multiprocessing.Manager()
550
+ result = manager.list()
551
+
552
+ p = multiprocessing.Process(target=unsafe_execute, args=(tmp_dir,))
553
+ p.start()
554
+ p.join(timeout=timeout + 1)
555
+ if p.is_alive():
556
+ p.kill()
557
+
558
+ if not result:
559
+ result.append("timed out")
560
+
561
+ return {
562
+ "task_id" : task_id,
563
+ "completion_id": completion_id,
564
+ "result" : result[0],
565
+ "passed" : result[0] == "passed",
566
+ "finish" : -1 if "finish" not in sample else sample["finish"],
567
+ "code" : sample["test_code"]
568
+ }
569
+
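The block above is the guard around `unsafe_execute`: the worker runs in a separate process, reports its status through a manager-backed list, and anything that hangs or dies silently is recorded as a timeout. A minimal, self-contained sketch of that pattern (the worker here is a stand-in, not the repository's `unsafe_execute`):

```python
import multiprocessing
import time

def _worker(result):
    # Stand-in for unsafe_execute: do some work, then report a status.
    time.sleep(0.1)
    result.append("passed")

def run_with_guard(timeout: float = 3.0) -> str:
    manager = multiprocessing.Manager()
    result = manager.list()
    p = multiprocessing.Process(target=_worker, args=(result,))
    p.start()
    p.join(timeout=timeout + 1)
    if p.is_alive():
        p.kill()
    # If the child never appended anything, treat it as a timeout.
    return result[0] if result else "timed out"

if __name__ == "__main__":
    print(run_with_guard())  # -> passed
```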
570
+ # Copyright (c) OpenAI (https://openai.com)
571
+
572
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
573
+ # of this software and associated documentation files (the "Software"), to deal
574
+ # in the Software without restriction, including without limitation the rights
575
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
576
+ # copies of the Software, and to permit persons to whom the Software is
577
+ # furnished to do so, subject to the following conditions:
578
+
579
+ # The above copyright notice and this permission notice shall be included in
580
+ # all copies or substantial portions of the Software.
581
+
582
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
583
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
584
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
585
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
586
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
587
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
588
+ # THE SOFTWARE.
589
+ # ============================================================================
590
+ @contextlib.contextmanager
591
+ def time_limit(seconds: float):
592
+ def signal_handler(signum, frame):
593
+ raise TimeoutException("Timed out!")
594
+
595
+ signal.setitimer(signal.ITIMER_REAL, seconds)
596
+ signal.signal(signal.SIGALRM, signal_handler)
597
+ try:
598
+ yield
599
+ finally:
600
+ signal.setitimer(signal.ITIMER_REAL, 0)
601
+
602
+
603
+ @contextlib.contextmanager
604
+ def swallow_io():
605
+ stream = WriteOnlyStringIO()
606
+ with contextlib.redirect_stdout(stream):
607
+ with contextlib.redirect_stderr(stream):
608
+ with redirect_stdin(stream):
609
+ yield
610
+
611
+
612
+ @contextlib.contextmanager
613
+ def create_tempdir():
614
+ with tempfile.TemporaryDirectory() as dirname:
615
+ with chdir(dirname):
616
+ yield dirname
617
+
618
+
619
+ class TimeoutException(Exception):
620
+ pass
621
+
622
+
623
+ class WriteOnlyStringIO(io.StringIO):
624
+ """ StringIO that throws an exception when it's read from """
625
+
626
+ def read(self, *args, **kwargs):
627
+ raise IOError
628
+
629
+ def readline(self, *args, **kwargs):
630
+ raise IOError
631
+
632
+ def readlines(self, *args, **kwargs):
633
+ raise IOError
634
+
635
+ def readable(self, *args, **kwargs):
636
+ """ Returns True if the IO object can be read. """
637
+ return False
638
+
639
+
640
+ class redirect_stdin(contextlib._RedirectStream): # type: ignore
641
+ _stream = 'stdin'
642
+
643
+
644
+ @contextlib.contextmanager
645
+ def chdir(root):
646
+ if root == ".":
647
+ yield
648
+ return
649
+ cwd = os.getcwd()
650
+ os.chdir(root)
651
+ try:
652
+ yield
653
+ except BaseException as exc:
654
+ raise exc
655
+ finally:
656
+ os.chdir(cwd)
657
+
658
+
659
+ def reliability_guard(maximum_memory_bytes: Optional[int] = None):
660
+ """
661
+ This disables various destructive functions and prevents the generated code
662
+ from interfering with the test (e.g. fork bomb, killing other processes,
663
+ removing filesystem files, etc.)
664
+
665
+ WARNING
666
+ This function is NOT a security sandbox. Untrusted code, including model-
667
+ generated code, should not be blindly executed outside of one. See the
668
+ Codex paper for more information about OpenAI's code sandbox, and proceed
669
+ with caution.
670
+ """
671
+
672
+ if maximum_memory_bytes is not None:
673
+ import resource
674
+ resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
675
+ resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
676
+ if not platform.uname().system == 'Darwin':
677
+ resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
678
+
679
+ faulthandler.disable()
680
+
681
+ import builtins
682
+ builtins.exit = None
683
+ builtins.quit = None
684
+
685
+ import os
686
+ os.environ['OMP_NUM_THREADS'] = '1'
687
+
688
+ os.kill = None
689
+ os.system = None
690
+ os.putenv = None
691
+ os.remove = None
692
+ os.removedirs = None
693
+ os.rmdir = None
694
+ os.fchdir = None
695
+ os.setuid = None
696
+ os.fork = None
697
+ os.forkpty = None
698
+ os.killpg = None
699
+ os.rename = None
700
+ os.renames = None
701
+ os.truncate = None
702
+ os.replace = None
703
+ os.unlink = None
704
+ os.fchmod = None
705
+ os.fchown = None
706
+ os.chmod = None
707
+ os.chown = None
708
+ os.chroot = None
709
+ os.fchdir = None
710
+ os.lchflags = None
711
+ os.lchmod = None
712
+ os.lchown = None
713
+ os.getcwd = None
714
+ os.chdir = None
715
+
716
+ import shutil
717
+ shutil.rmtree = None
718
+ shutil.move = None
719
+ shutil.chown = None
720
+
721
+ import subprocess
722
+ subprocess.Popen = None # type: ignore
723
+
724
+ __builtins__['help'] = None
725
+
726
+ import sys
727
+ sys.modules['ipdb'] = None
728
+ sys.modules['joblib'] = None
729
+ sys.modules['resource'] = None
730
+ sys.modules['psutil'] = None
731
+ sys.modules['tkinter'] = None
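The helpers above (`create_tempdir`, `swallow_io`, `time_limit`, `TimeoutException`) are composed when a generated Python program is executed. A hedged sketch of that composition, assuming the definitions in this file are in scope (Unix only, since `time_limit` relies on `SIGALRM`; `reliability_guard` is omitted here because it deliberately leaves the interpreter in a restricted state):

```python
program = "assert sum([1, 2, 3]) == 6"  # stand-in for a generated test program
result = []
try:
    with create_tempdir():            # run inside a throwaway working directory
        with swallow_io():            # silence stdout/stderr and block stdin reads
            with time_limit(5.0):     # raise TimeoutException after 5 seconds
                exec(program, {})
    result.append("passed")
except TimeoutException:
    result.append("timed out")
except BaseException as e:
    result.append(f"failed: {e}")
print(result)  # ['passed']
```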
DeepSeek-Coder-main/Evaluation/HumanEval/humaneval.py ADDED
@@ -0,0 +1,163 @@
1
+ import time
2
+ import string
3
+ import multiprocessing
4
+ import os
5
+ import numpy as np
6
+ import json
7
+ import re
8
+ import torch
9
+ import datetime
10
+ import subprocess
11
+ import torch.distributed as dist
12
+ from attrdict import AttrDict
13
+ from human_eval.evaluation import evaluate_functional_correctness
14
+ from transformers import AutoTokenizer
15
+ from utils.dataset import HumanEvalDataset
16
+ from utils.utils import cleanup_code
17
+
18
+ class HumanEval:
19
+ """
20
+ HumanEval evaluation class.
21
+ """
22
+ def __init__(self, data_root, max_seq_len=2048,
23
+ language="python", max_gen_len=200, batch_size=512,
24
+ log_dir=None, temperature=0, issft=False, top_p=0.95,
25
+ model_name="", inference_increment=True,
26
+ tokenizer_cfg=None, n_sample=40, k_sample=1):
27
+ self.data_root = data_root
28
+ self.max_seq_len = max_seq_len
29
+ self.max_gen_len = max_gen_len
30
+ self.batch_size = batch_size
31
+ self.k = k_sample
32
+ self.n_sample = n_sample
33
+ self.language = language
34
+ self.log_dir = log_dir
35
+ self.sft = issft
36
+ self.temperature = temperature
37
+ self.top_p = top_p
38
+ self.model_name = tokenizer_cfg["model_path"].replace("/", "_")
39
+ self.inference_increment = inference_increment
40
+ os.makedirs(self.log_dir, exist_ok=True)
41
+ tokenizer_cls = tokenizer_cfg.pop('cls')
42
+ try:
43
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_cfg.pop("model_path"), trust_remote_code=True)
44
+ except Exception as e:
45
+ print(e)
46
+ assert False
47
+
48
+ @torch.no_grad()
49
+ def eval_model(self, gpt, accelerator):
50
+ """
51
+ Evaluate the model on HumanEval.
52
+ """
53
+ assert self.log_dir is not None, "log_dir should not be None when evaluating humaneval"
54
+ dataset = HumanEvalDataset(self.data_root, sample_num=self.n_sample, language=self.language, issft=self.sft)
55
+ nprompt = len(dataset) // self.n_sample
56
+ dp_rank = accelerator.process_index
57
+ dp_size = accelerator.num_processes
58
+ if self.k > 1:
59
+ assert self.n_sample >= 100, "HumanEval PASS@100 needs n_sample >= 100"
60
+ gpt.eval()
61
+ # each process will process a subset of the dataset
62
+ prompt_indices_split = np.array_split(range(nprompt), dp_size)
63
+ prompt_indices = prompt_indices_split[dp_rank]
64
+ indices = [x * self.n_sample + j for x in prompt_indices for j in range(self.n_sample)]
65
+ all_num = len(indices)
66
+ processed_num = 0
67
+ log_file = os.path.join(self.log_dir,
68
+ f'{self.model_name}_rank{dp_rank}_bs{self.batch_size}_shot_log_{self.language}.json')
69
+ tmpfile = open(log_file, "w")
70
+ start_time = time.time()
71
+ # split the dataset into batches and construct a list of inputs
72
+ for idx in range(0, len(indices), self.batch_size):
73
+ prompt_list = []
74
+ prompt_lens = []
75
+ orriginal_prompt_list = []
76
+ tokenized_prompt_lens = []
77
+ taskid = []
78
+ # get the prompts from the dataset
79
+ for j in indices[idx:idx + self.batch_size]:
80
+ data = dataset[j]
81
+ fprompt = data["prompt"].strip()
82
+ prompt_list.append(fprompt)
83
+ tmp = self.tokenizer.encode(fprompt)
84
+ orriginal_prompt_list.append(data["original_prompt"])
85
+ prompt_lens.append(len(fprompt))
86
+ tokenized_prompt_lens.append(tmp)
87
+ taskid.append(data["task_id"])
88
+ input_ids = torch.tensor(tokenized_prompt_lens).to(accelerator.device)
89
+ # generate the code
90
+ if self.temperature != 0:
91
+ decoded = gpt.generate(
92
+ input_ids=input_ids,
93
+ max_new_tokens=self.max_gen_len,
94
+ do_sample=True,
95
+ eos_token_id=self.tokenizer.eos_token_id,
96
+ temperature=self.temperature,
97
+ top_p=self.top_p,
98
+ pad_token_id=self.tokenizer.eos_token_id,
99
+ )
100
+ else:
101
+ decoded = gpt.generate(
102
+ input_ids=input_ids,
103
+ max_new_tokens=self.max_gen_len,
104
+ do_sample=False,
105
+ eos_token_id=self.tokenizer.eos_token_id,
106
+ pad_token_id=self.tokenizer.eos_token_id,
107
+ )
108
+ # save the results to a file
109
+ for local_idx, text in enumerate(decoded):
110
+ prediction = decoded[local_idx]
111
+ prediction = self.tokenizer.decode(prediction, skip_special_tokens=True)
112
+ suffixprediction = prediction[prompt_lens[local_idx]:]
113
+ suffixprediction = cleanup_code(suffixprediction, self.language, "humaneval", self.sft, dataset.stopwords)
114
+ # sft mode does not need original prompt
115
+ if not self.sft:
116
+ suffixprediction = orriginal_prompt_list[local_idx] + "\n" + suffixprediction
117
+ res = {"task_id": taskid[local_idx], "generation": suffixprediction, "prompt": orriginal_prompt_list[local_idx], "wholecode":prediction}
118
+ tmpfile.write(json.dumps(res) + "\n")
119
+ tmpfile.flush()
120
+ processed_num += 1
121
+ self.log_score(dp_rank, processed_num, all_num, start_time, self.batch_size)
122
+ tmpfile.close()
123
+ accelerator.wait_for_everyone()
124
+ # calculate the final score of pass@k
125
+ self._calculate_final_score(accelerator)
126
+ accelerator.wait_for_everyone()
127
+ return
128
+
129
+ def log_score(self, dp_rank, processed_num, all_num, start_time, bs):
130
+ """
131
+ Log the score.
132
+ """
133
+ mem = torch.cuda.max_memory_allocated() / (1 << 30)
134
+ avg_time = (time.time() - start_time) / processed_num * bs
135
+ print(
136
+ f'DP RANK:{dp_rank} process_num/all_num:{int(processed_num)}/{all_num} '
137
+ f'avg_time_per_batch:{avg_time:.2f} s '
138
+ f'still_need:{((all_num - processed_num) // bs + 1) * avg_time / 60:.2f} m',
139
+ f'mem:{mem:.3f} GiB bs:{bs}',
140
+ flush=True
141
+ )
142
+ if processed_num == all_num:
143
+ print(f'EVAL DONE! Process time {(time.time() - start_time) / 60:.2f} m', flush=True)
144
+
145
+ def _calculate_final_score(self, accelerator):
146
+ """
147
+ Calculate the final score.
148
+ """
149
+ if accelerator.is_local_main_process:
150
+ logfilepath = os.path.join(self.log_dir, f'final_{self.model_name}.jsonl')
151
+ logfile = open(logfilepath, "w")
152
+ for i in range(accelerator.num_processes):
153
+ tmplogfile = os.path.join(self.log_dir, f'{self.model_name}_rank{i}_bs{self.batch_size}_shot_log_{self.language}.json')
154
+ logfile.write(open(tmplogfile).read().strip() + "\n")
155
+ os.remove(tmplogfile)
156
+ logfile.close()
157
+ timeout = 10
158
+ runlang = self.language
159
+ res = evaluate_functional_correctness(input_file=logfilepath, problem_file=os.path.join(self.data_root, f"humaneval-{self.language}.jsonl"), tmp_dir=self.log_dir, timeout=timeout, language=runlang)
160
+ print("score is", res['pass@%d' % self.k])
161
+ os.remove(logfilepath)
162
+ return
163
+
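A minimal sketch of driving the `HumanEval` class above under `accelerate`. The checkpoint name and paths are placeholders, and `tokenizer_cfg` must carry both a `cls` and a `model_path` key because `__init__` pops them; the repository's own entry point may wire this up differently:

```python
import torch
from accelerate import Accelerator
from transformers import AutoModelForCausalLM
from humaneval import HumanEval

accelerator = Accelerator()
model_path = "deepseek-ai/deepseek-coder-1.3b-base"  # placeholder checkpoint

evaluator = HumanEval(
    data_root="data",          # directory containing humaneval-python.jsonl
    language="python",
    log_dir="logs",
    batch_size=1,              # one prompt at a time avoids padding concerns
    n_sample=1,
    k_sample=1,
    tokenizer_cfg={"cls": "AutoTokenizer", "model_path": model_path},
)

model = AutoModelForCausalLM.from_pretrained(
    model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
)
model = accelerator.prepare(model)
evaluator.eval_model(model, accelerator)
```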
DeepSeek-Coder-main/Evaluation/HumanEval/javatuples-1.2.jar ADDED
Binary file (65.5 kB). View file
 
DeepSeek-Coder-main/Evaluation/HumanEval/test_config.yaml ADDED
@@ -0,0 +1,15 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ distributed_type: MULTI_GPU
3
+ downcast_bf16: 'no'
4
+ gpu_ids: all
5
+ machine_rank: 0
6
+ main_training_function: main
7
+ mixed_precision: 'no'
8
+ num_machines: 1
9
+ num_processes: 3
10
+ rdzv_backend: static
11
+ same_network: true
12
+ tpu_env: []
13
+ tpu_use_cluster: false
14
+ tpu_use_sudo: false
15
+ use_cpu: false
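`test_config.yaml` is a standard Hugging Face Accelerate launch config: one machine, three GPU worker processes, no mixed precision. With `num_processes: 3`, each rank evaluates roughly a third of the prompts, mirroring the `np.array_split` sharding in `eval_model`; a small illustrative sketch:

```python
import numpy as np

n_prompt, n_sample, dp_size = 164, 2, 3  # 164 HumanEval problems, 2 samples, 3 ranks
for dp_rank in range(dp_size):
    prompt_indices = np.array_split(range(n_prompt), dp_size)[dp_rank]
    indices = [p * n_sample + j for p in prompt_indices for j in range(n_sample)]
    print(dp_rank, len(indices))  # ranks 0, 1, 2 handle 110, 110, 108 samples
```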
DeepSeek-Coder-main/Evaluation/HumanEval/utils/__pycache__/dataset.cpython-38.pyc ADDED
Binary file (2.48 kB). View file
 
DeepSeek-Coder-main/Evaluation/HumanEval/utils/__pycache__/instruct.cpython-38.pyc ADDED
Binary file (2.87 kB). View file
 
DeepSeek-Coder-main/Evaluation/HumanEval/utils/__pycache__/utils.cpython-38.pyc ADDED
Binary file (1.24 kB). View file
 
DeepSeek-Coder-main/Evaluation/HumanEval/utils/__pycache__/utils.cpython-39.pyc ADDED
Binary file (3.49 kB). View file
 
DeepSeek-Coder-main/Evaluation/HumanEval/utils/dataset.py ADDED
@@ -0,0 +1,61 @@
1
+ import os
2
+ import numpy as np
3
+ import json
4
+
5
+ class HumanEvalDataset:
6
+
7
+ def __init__(self, root, sample_num=1, language="python", issft=False):
8
+ """
9
+ root: the path to the HumanEval dataset
10
+ sample_num: the number of samples for each prompt
11
+ language: the language of the HumanEval dataset
12
+ issft: whether to use the SFT setting
13
+ """
14
+ self.root = root
15
+ self.data = open(os.path.join(self.root, f"humaneval-{language}.jsonl")).readlines()
16
+
17
+ tmp = self.get_qa_only_data(self.data, issft)
18
+ self.clean_data = []
19
+ for i in range(len(tmp)):
20
+ for j in range(sample_num):
21
+ self.clean_data.append(tmp[i])
22
+ self.stopwords = self.clean_data[0]["stopwords"]
23
+ np.random.seed(1234)
24
+ print(f"Read HumanEval from {root}, number of samples {len(self.clean_data)}")
25
+
26
+ def get_qa_only_data(self, data_json, sft=False):
27
+ """
28
+ data_json: the jsonl file of HumanEval
29
+ sft: whether to use the SFT setting
30
+ return: a list of dict, each dict contains the prompt, task_id and stopwords
31
+ """
32
+ ans = []
33
+ for line in data_json:
34
+ line = json.loads(line)
35
+ prompt = line["prompt"].strip()
36
+ if "prefix" in line:
37
+ origin_prompt = line["prefix"]
38
+ else:
39
+ origin_prompt = line["prompt"]
40
+
41
+ if sft:
42
+ prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context.\nWrite a response that appropriately completes the request.\n\n### Instruction:\nWrite a program to perform the given task.\n\nInput:\n{prompt}\n\n### Response:\n"""
43
+ if "stop_tokens" in line:
44
+ s = line["stop_tokens"]
45
+ else:
46
+ s = []
47
+ ans.append({"prompt":prompt, "task_id":line["task_id"], "original_prompt": origin_prompt, "stopwords":s})
48
+ return ans
49
+
50
+ def __len__(self):
51
+ """
52
+ return the number of samples in the dataset
53
+ """
54
+ return len(self.clean_data)
55
+
56
+ def __getitem__(self, index):
57
+ """
58
+ return the sample at index
59
+ """
60
+ sample = self.clean_data[index]
61
+ return sample
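A short usage sketch for `HumanEvalDataset`; the root directory is assumed to contain the `humaneval-<language>.jsonl` files shipped under `data/` in this folder:

```python
from utils.dataset import HumanEvalDataset

dataset = HumanEvalDataset(root="data", sample_num=2, language="python", issft=False)
print(len(dataset))            # number of problems * sample_num
sample = dataset[0]
print(sample["task_id"])       # task id string from the jsonl
print(sample["prompt"][:80])   # prompt prefix fed to the model
print(sample["stopwords"])     # stop tokens used later by cleanup_code
```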
DeepSeek-Coder-main/Evaluation/HumanEval/utils/utils.py ADDED
@@ -0,0 +1,146 @@
1
+ import re
2
+
3
+ languge_settings = {
4
+ 'python': {
5
+ 'full_name': 'Python',
6
+ 'indent': 4,
7
+ },
8
+ 'cpp': {
9
+ 'full_name': 'cpp',
10
+ 'indent': 0,
11
+ 'main': "int main()",
12
+ },
13
+ 'java': {
14
+ 'full_name': 'Java',
15
+ 'indent': 4,
16
+ 'main': "public static void main",
17
+ },
18
+ 'cs': {
19
+ 'full_name': "csharp",
20
+ 'indent': 0,
21
+ 'main': "public static void Main",
22
+ },
23
+ 'php': {
24
+ 'full_name': "PHP",
25
+ 'indent': 0,
26
+ },
27
+ 'ts': {
28
+ 'full_name': "TypeScript",
29
+ 'indent': 0,
30
+ },
31
+ 'js': {
32
+ 'full_name': "JavaScript",
33
+ 'indent': 0
34
+ },
35
+ 'sh': {
36
+ 'full_name': "Bash",
37
+ 'indent': 0
38
+ }
39
+ }
40
+
41
+ def get_function_name(question: str, lang: str):
42
+ func_lines = [x for x in question.strip().split('\n') if x.strip()]
43
+
44
+ if lang.lower() == 'python':
45
+ func_idx = [i for i in range(len(func_lines)) if func_lines[i].startswith("def ")][-1]
46
+ func_name = func_lines[func_idx].split('(')[0].strip()
47
+ func_prefix = "\n".join(func_lines[:func_idx])
48
+ return func_name, func_prefix
49
+
50
+ func_name = func_lines[-1].split('{')[0].strip()
51
+ func_prefix = "\n".join(func_lines[:-1])
52
+ return func_name, func_prefix
53
+
54
+ def extract_generation_code(example: str, lang_code: str, verbose: bool=False):
55
+ task_id = example['task_id']
56
+ output = example.get('output', example.get("gpt_completion"))
57
+ question = example["prompt"].strip()
58
+ setting = languge_settings[lang_code]
59
+ lang = setting['full_name']
60
+ indent = setting['indent']
61
+
62
+ try:
63
+ code_block: str = re.findall(f'```{lang.lower()}\n(.*?)```', output, re.DOTALL | re.IGNORECASE)[0]
64
+ if verbose:
65
+ print(">>> Task: {}\n{}".format(task_id, code_block))
66
+
67
+ # Remove main
68
+ if setting.get('main', None) and setting['main'] in code_block:
69
+ main_start = code_block.index(setting['main'])
70
+ code_block = code_block[:main_start]
71
+
72
+ func_name, func_prefix = get_function_name(question, lang)
73
+
74
+ try:
75
+ start = code_block.lower().index(func_name.lower())
76
+ indent = 0
77
+ while start - indent >= 0 and code_block[start - indent-1] == ' ':
78
+ indent += 1
79
+
80
+ try:
81
+ end = code_block.rindex('\n' + ' '*indent + '}')
82
+ except:
83
+ end = len(code_block)
84
+ except:
85
+ start = 0
86
+ try:
87
+ end = code_block.rindex('\n' + ' '*indent + '}')
88
+ except:
89
+ end = len(code_block)
90
+
91
+ body = code_block[start:end]
92
+
93
+ if lang_code.lower() in ['php', 'ts', 'js']:
94
+ body += '\n' + ' '*indent + '}'
95
+
96
+ generation = func_prefix + '\n' + body + '\n'
97
+ example['generation'] = generation
98
+
99
+ except Exception as ex:
100
+ print("Failed to extract code block with error `{}`:\n>>> Task: {}\n>>> Output:\n{}".format(
101
+ ex, task_id, output
102
+ ))
103
+ example['generation'] = example['prompt'] + '\n' + output
104
+
105
+ return example
106
+
107
+ def cleanup_code(
108
+ code: str,
109
+ language_type: str = None,
110
+ dataset: str = None,
111
+ issft: bool = False,
112
+ stop_words = []
113
+ ):
114
+ """
115
+ Cleans up the generated code.
116
+ """
117
+
118
+ if language_type.lower() == "python":
119
+ if issft:
120
+ code = _clean_python_code_for_sft(code)
121
+ stop_words = ["\ndef", "\nclass", "\nif", "\n#", "\nprint"]
122
+ code = _truncate_code_at_stopwords(code, stop_words)
123
+ elif language_type.lower() == "ts":
124
+ code = _truncate_code_at_stopwords(code, stop_words + ["\nexport", "\nimport", "\nexport default", "\nimport default", "\nconsole.log"])
125
+ else:
126
+ code = _truncate_code_at_stopwords(code, stop_words)
127
+
128
+ return code
129
+
130
+ def _clean_python_code_for_sft(code):
131
+ code = code.replace("\r", "")
132
+ if "```python" in code:
133
+ code_start_idx = code.index("```python")
134
+ code = code[code_start_idx:].replace("```python", "").strip()
135
+ end_idx = code.find("```") if "```" in code else len(code)
136
+ code = code[:end_idx].strip()
137
+
138
+ return code
139
+
140
+ def _truncate_code_at_stopwords(code, stop_words):
141
+ min_stop_idx = len(code)
142
+ for stop_word in stop_words:
143
+ stop_index = code.find(stop_word)
144
+ if 0 <= stop_index < min_stop_idx:
145
+ min_stop_idx = stop_index
146
+ return code[:min_stop_idx]
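A quick sketch of what `cleanup_code` does to a raw Python completion in the non-SFT setting: the completion is truncated at the first structural stop word, so only the body of the target function survives:

```python
from utils.utils import cleanup_code

raw = (
    "    return sorted(numbers)\n"
    "\n"
    "def another_function():\n"
    "    pass\n"
)
clean = cleanup_code(raw, language_type="python", dataset="humaneval", issft=False)
print(repr(clean))  # '    return sorted(numbers)\n' -- cut at the first "\ndef"
```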
DeepSeek-Coder-main/Evaluation/LeetCode/data/20240121-Jul-zh.jsonl ADDED
The diff for this file is too large to render. See raw diff