Direct Preference Optimization (DPO) is a method for improving the alignment of language models.
Current approaches to training large language models start with an initial step of unsupervised training on a large corpus of text. This yields models that generate the tokens most likely to follow a prompt under the conditional distribution of the data they were trained on. When we want our language model to serve as a chat bot or code assistant, this often produces undesirable text: high-quality conversations or coding examples may be rare in the training corpus, and therefore rare in the model's output.
To address this, we can fine-tune our model on a preference dataset so that it generates text more closely aligned with a downstream task. The dataset has three columns: a prompt, a chosen output, and a rejected output. Given the prompt, we want our model to generate an output closer to the chosen output than to the rejected output. In this example we use a dataset where the preferred output is considered safer and more appropriate for a chat bot.
How can we structure this problem in a way that our model can learn from it?
One approach is Reinforcement Learning from Human Feedback (RLHF). With RLHF, we first train a binary classifier that learns to discriminate between outputs. A Bradley-Terry model is often used as the classifier here, where the probability that output y1 is preferred to y2 given prompt x is defined as p(y1 >> y2 | x) = sigmoid(r(x, y1) - r(x, y2)), where >> denotes "is preferred to" and r(x, y) is the reward. The reward function is learned using a binary cross-entropy loss. As a starting point for the reward function, we can take a supervised fine-tuned (SFT) LLM with its last layer replaced to output a scalar value representing the reward. The next step in RLHF is to use an algorithm like PPO to optimize our policy to maximize this reward while not straying too far from a reference policy. In this step, we sample prompts from our dataset, generate completions, and send them to our reward model to get a score. During this fine-tuning process, we aim to maximize the reward while minimizing the KL divergence between the token-level probability distributions of our fine-tuned model and of our reference model.
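To make the reward-modeling step concrete, here is a minimal sketch of the Bradley-Terry loss in PyTorch. The function name and the toy reward values are made up for illustration; in practice the rewards would come from the SFT model with its scalar head.

import torch
import torch.nn.functional as F

def reward_model_loss(reward_chosen, reward_rejected):
    # Bradley-Terry: p(chosen >> rejected) = sigmoid(r_chosen - r_rejected).
    # Maximizing the likelihood of the preferences is equivalent to minimizing
    # -log sigmoid(r_chosen - r_rejected), a binary cross-entropy where the
    # chosen output is always the positive class.
    return -F.logsigmoid(reward_chosen - reward_rejected).mean()

# toy scalar rewards for a batch of three preference pairs
reward_chosen = torch.tensor([1.2, 0.3, 2.0])
reward_rejected = torch.tensor([0.4, 0.5, -1.0])
print(reward_model_loss(reward_chosen, reward_rejected))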
A downside of this approach is that it is expensive to first train a reward model and then query it inside the training loop while sampling from the policy during fine-tuning. RLHF is also complex and often unstable.
An alternative approach is DPO, which doesn't use reinforcement learning. From the DPO paper:
Instead of using the preference model to define a preference loss to train a reward model and then train a policy that optimizes the learned reward model, DPO uses a change of variables to define the preference loss as a function of the policy directly. Given a dataset of human preferences over model responses, DPO can therefore optimize a policy using a simple binary cross entropy objective, producing the optimal policy to an implicit reward function fit to the preference data.
With DPO, the preference model is expressed in terms of the policy being optimized and a reference policy rather than a separate reward model. The loss function, as laid out in the paper, changes accordingly: it becomes a binary cross-entropy loss on the difference between the scaled log-probability ratios (policy over reference) of the chosen and rejected completions.
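To make the change of variables concrete, here is a minimal sketch of the DPO loss. The function name and the toy log-probabilities are made up for illustration; the DPOTrainer used below implements this loss for us. It takes per-sequence log probabilities of the chosen and rejected completions under the policy being trained and under the frozen reference model.

import torch
import torch.nn.functional as F

def dpo_loss(policy_chosen_logps, policy_rejected_logps,
             ref_chosen_logps, ref_rejected_logps, beta=0.1):
    # log ratios of the policy to the reference model for each completion
    chosen_logratio = policy_chosen_logps - ref_chosen_logps
    rejected_logratio = policy_rejected_logps - ref_rejected_logps
    # binary cross-entropy on the scaled difference of log ratios
    return -F.logsigmoid(beta * (chosen_logratio - rejected_logratio)).mean()

# toy per-sequence log probabilities for a batch of two preference pairs
policy_chosen = torch.tensor([-12.0, -15.0])
policy_rejected = torch.tensor([-14.0, -13.5])
ref_chosen = torch.tensor([-13.0, -15.5])
ref_rejected = torch.tensor([-13.5, -13.0])
print(dpo_loss(policy_chosen, policy_rejected, ref_chosen, ref_rejected))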
The trl library has a DPOTrainer, subclassed from the Hugging Face Trainer class. DPOTrainer implements the DPO loss function, along with methods to help with tokenizing inputs, generating predictions, and evaluating the model. It also supports PEFT (Parameter-Efficient Fine-Tuning), which we will use to apply LoRA (Low-Rank Adaptation of Large Language Models). This, along with quantization (8-bit model weights), reduces the number of trainable parameters and the memory footprint of the model so that we can fine-tune it on a local machine with a GPU.
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import DPOTrainer
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
# note: to get bitsandbytes to work on windows, uninstall bitsandbytes and reinstall with
# pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl
def process_hh_rlhf_sample(sample):
    """
    sample is a dictionary with keys 'chosen' and 'rejected'.
    Both values contain the full prompt followed by a completion.
    Find the index of the last occurrence of the substring '\n\nAssistant: '
    to split the prompt from the completion.
    Return a dictionary with keys 'prompt', 'chosen', and 'rejected'.
    """
    term = '\n\nAssistant: '
    end_of_prompt_index = sample['chosen'].rfind(term)
    # extract the prompt, including the final '\n\nAssistant: ' marker
    prompt = sample['chosen'][:end_of_prompt_index + len(term)]
    # extract the chosen completion
    chosen = sample['chosen'][len(prompt):]
    # extract the rejected completion
    rejected = sample['rejected'][len(prompt):]
    return {'prompt': prompt, 'chosen': chosen, 'rejected': rejected}
def get_anthropic_hh_rlhf_dataset(split='train'):
    """
    The Anthropic HH-RLHF dataset contains 160k training examples and 8k test examples.
    Each example is a dictionary with two keys: 'chosen' and 'rejected'.
    Each of these includes the prompt and the completion.
    Extract the prompt, chosen completion, and rejected completion from each example.
    https://arxiv.org/abs/2204.05862
    https://huggingface.co/datasets/Anthropic/hh-rlhf
    """
    dataset = load_dataset('Anthropic/hh-rlhf', split=split)
    return dataset.map(process_hh_rlhf_sample)
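As a quick sanity check, we can run the processing function on a single made-up sample in the dataset's '\n\nHuman: ... \n\nAssistant: ...' format (the dialogue below is not from the dataset):

example = {
    'chosen': '\n\nHuman: What is DPO?\n\nAssistant: It is a method for aligning language models with preference data.',
    'rejected': '\n\nHuman: What is DPO?\n\nAssistant: No idea.',
}
print(process_hh_rlhf_sample(example))
# prompt:   '\n\nHuman: What is DPO?\n\nAssistant: '
# chosen:   'It is a method for aligning language models with preference data.'
# rejected: 'No idea.'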
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True
)
torch_dtype = torch.bfloat16
model = AutoModelForCausalLM.from_pretrained(
    'stabilityai/stablelm-2-1_6b',
    quantization_config=bnb_config,
    torch_dtype=torch_dtype,
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(
    'stabilityai/stablelm-2-1_6b',
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token
# https://github.com/huggingface/trl/issues/1073
tokenizer.add_special_tokens({"bos_token": tokenizer.eos_token})
tokenizer.bos_token_id = tokenizer.eos_token_id
train_dataset = get_anthropic_hh_rlhf_dataset(split='train')
test_dataset = get_anthropic_hh_rlhf_dataset(split='test[:1000]') # use a small test set for now
# define the training arguments
training_args = TrainingArguments(
    max_steps=64,  # only 64 gradient updates, not even one epoch
    remove_unused_columns=False,
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    output_dir='output',
    logging_strategy='steps',
    logging_dir='logs',
    logging_steps=16,
    lr_scheduler_type='constant'  # default is linear
)
peft_config = LoraConfig(
    r=64,  # rank of the low-rank matrices
    lora_alpha=16,  # scaling factor for the weight matrices
    bias='none',  # don't train bias params
    task_type='CAUSAL_LM',
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
        'o_proj',
        'gate_proj',
        'up_proj',
        'down_proj',
        'lm_head',
    ]
)
model = get_peft_model(model, peft_config)
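PEFT models expose print_trainable_parameters, which is a handy check that LoRA has reduced the trainable parameter count to a small fraction of the full model (the exact numbers depend on the base model and the LoRA rank chosen above):

model.print_trainable_parameters()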
# DPOTrainer expects the raw 'prompt', 'chosen', and 'rejected' text columns and
# performs tokenization internally, so no manual tokenization step is needed here.
trainer = DPOTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    beta=0.1,  # beta param for the DPO loss
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    max_length=512,  # max length of the full prompt + completion sequence
    max_target_length=128,  # max length of the completion
    max_prompt_length=128,  # max length of the prompt
    generate_during_eval=False,
    peft_config=peft_config
)
# Evaluate the model before training.
print(trainer.evaluate())
trainer.train()
# Save and evaluate the model after training.
trainer.save_model('out')
eval_results = trainer.evaluate()
print(eval_results)
# View the training loss after logged steps.
print(trainer.state.log_history)
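trainer.state.log_history is a list of dictionaries, one per logging or evaluation event, so the training losses can be pulled out with a small comprehension (using the standard Trainer key 'loss'):

# collect the training loss from each logged step
train_losses = [entry['loss'] for entry in trainer.state.log_history if 'loss' in entry]
print(train_losses)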
# turn off warnings for this cell
import warnings
warnings.filterwarnings('ignore')
tokenizer.pad_token = tokenizer.eos_token
device = torch.device('cuda')
model = model.to(device)
prompt = 'Some popular cities for tourists are'
input_ids = tokenizer(prompt, return_tensors='pt').input_ids
input_ids = input_ids.to(device)
output_ids = model.generate(input_ids, max_length=128, do_sample=True, num_return_sequences=1)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))