Spaces:
Configuration error
Configuration error
| """ | |
| Convert alpaca dataset into sharegpt format. | |
| Usage: python convert_alpaca.py --in_file alpaca_data.json --out_file alpaca_data_sharegpt.json | |
| """ | |
| import argparse | |
| from datasets import load_dataset | |
def process_alpaca(examples):
    """Convert a batch of alpaca records into sharegpt-style conversations.

    Args:
        examples: Column-oriented batch (the shape ``Dataset.map`` passes
            with ``batched=True``) holding parallel ``instruction``,
            ``input`` and ``output`` lists.

    Returns:
        A dict with a single ``conversations`` column. Each entry is a
        two-turn list: a ``human`` turn (instruction, plus the input when
        one is present) followed by a ``gpt`` turn (the output).
    """
    convs = []
    for instruction, inp, output in zip(
        examples['instruction'], examples['input'], examples['output']
    ):
        # Append any non-blank input to the instruction. A missing (None)
        # or whitespace-only input is treated as "no input"; a
        # single-character input such as "5" is real data and is kept.
        if inp and inp.strip():
            instruction = instruction + '\n\n' + inp
        convs.append([
            {"from": "human", "value": instruction},
            {"from": "gpt", "value": output},
        ])
    return {"conversations": convs}


def main():
    """Parse CLI arguments, convert the input dataset, write JSON lines.

    Raises:
        ValueError: If a non-alpaca dataset has neither a ``conversations``
            column nor an ``items`` column to rename.
    """
    parser = argparse.ArgumentParser()
    # Both paths are mandatory; fail at argument parsing rather than deep
    # inside the dataset loader/writer with a None path.
    parser.add_argument("--in_file", type=str, required=True)
    parser.add_argument("--out_file", type=str, required=True)
    parser.add_argument("--data_type", type=str, default='alpaca')
    args = parser.parse_args()
    print(args)

    data_files = {"train": args.in_file}
    raw_datasets = load_dataset('json', data_files=data_files)
    ds = raw_datasets['train']

    if args.data_type in ['alpaca']:
        ds = ds.map(
            process_alpaca,
            batched=True,
            remove_columns=ds.column_names,
            desc="Running process",
        )
    else:
        # Other sharegpt dataset, need rename to conversations and remove unused columns
        if "items" in ds.column_names:
            # Dataset exposes rename_column(old, new); it has no rename().
            ds = ds.rename_column("items", "conversations")
        if "conversations" not in ds.column_names:
            raise ValueError(
                "Expected a 'conversations' (or 'items') column, "
                f"found: {ds.column_names}"
            )
        columns_to_remove = [
            name for name in ds.column_names if name != 'conversations'
        ]
        ds = ds.remove_columns(columns_to_remove)

    ds.to_json(args.out_file, lines=True, force_ascii=False)


if __name__ == "__main__":
    main()