"""Convert the Linux bug-fix JSONL dataset into prompt/completion JSONL."""

import json

# Source dataset: one JSON object per line (JSONL).
INPUT_FILE = './output/linux_bugfix_dataset.jsonl'
# Destination file: one {"prompt", "completion"} JSON object per line.
OUTPUT_FILE = '../dataset/training_data_prompt_completion.jsonl'
def format_prompt(original_code: str, instruction: str) -> str:
    """Build the prompt text for one training example.

    Both inputs are stripped of leading/trailing whitespace and placed
    under fixed section headers.

    Args:
        original_code: The original C source to be fixed.
        instruction: The instruction describing the fix.

    Returns:
        The prompt string, ending with the "Return the diff that fixes it:"
        line and a trailing newline.
    """
    return (
        "Given the following original C code:\n"
        f"{original_code.strip()}\n\n"
        "Instruction:\n"
        f"{instruction.strip()}\n\n"
        "Return the diff that fixes it:\n"
    )
def format_completion(diff_code: str) -> str:
    """Return the diff text with leading/trailing whitespace removed.

    Args:
        diff_code: The diff that serves as the completion target.

    Returns:
        The stripped diff string.
    """
    return diff_code.strip()
def convert_dataset(input_path: str, output_path: str) -> None:
    """Convert a JSONL dataset into prompt/completion JSONL.

    Each non-blank input line is parsed as a JSON object from which
    ``["input"]["original code"]``, ``["input"]["instruction"]`` and
    ``["output"]["diff codes"]`` are read. One JSON object with the keys
    ``"prompt"`` and ``"completion"`` is written per record.

    Args:
        input_path: Path of the JSONL file to read (UTF-8).
        output_path: Path of the JSONL file to write (UTF-8, overwritten).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the expected keys.
    """
    with open(input_path, 'r', encoding='utf-8') as fin, \
            open(output_path, 'w', encoding='utf-8') as fout:
        for line in fin:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # records; json.loads would raise on them, so skip them.
            if not line.strip():
                continue

            data = json.loads(line)
            original_code = data["input"]["original code"]
            instruction = data["input"]["instruction"]
            diff_code = data["output"]["diff codes"]

            new_entry = {
                "prompt": format_prompt(original_code, instruction),
                "completion": format_completion(diff_code),
            }
            # ensure_ascii=False keeps non-ASCII characters readable in the
            # UTF-8 output instead of \uXXXX escapes.
            fout.write(json.dumps(new_entry, ensure_ascii=False) + '\n')

    # Reported once, after both files have been closed.
    print(f"[DONE] Converted dataset saved to: {output_path}")
# Run the conversion with the module-level default paths when executed as a
# script; importing the module has no side effects.
if __name__ == "__main__":
    convert_dataset(INPUT_FILE, OUTPUT_FILE)