Wednesday 25 November 2020

Compress methods and get the data into a One Hot Encoding Matrix

My data frame contains purchases. A buyer (purchaseid) can buy several items (itemid). I split the data with splitter() and put each part into a DOK (dictionary-of-keys) sparse matrix with generate_matrix(). I then pass these matrices to get_train_samples() to get my training and validation inputs and labels (x_train, x_test, y_train and y_test).

How can I simplify this code? And how can I combine generate_matrix() and get_train_samples() so that the data ends up in a 'real' one-hot encoding matrix?

Dataframe:

# Example data: two parallel lists, so each position is one (purchaseid, itemid) row.
# The same pair may appear more than once (e.g. purchase 7 / item 9).
d = {'purchaseid': [0, 0, 0, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 9, 9, 9, 9],
     'itemid': [ 3, 8, 2, 10, 3, 10, 4, 12, 3, 12, 3, 4, 8, 6, 3, 0, 5, 12, 9, 9, 13, 1, 7, 11, 11]}
df = pd.DataFrame(data=d)

   purchaseid  itemid
0           0       3
1           0       8
2           0       2
3           1      10
4           2       3

Code:

import random
import numpy as np
import pandas as pd
import scipy.sparse as sp


# Percentage (0-100) of distinct purchase ids that splitter() holds out.
PERCENTAGE_SPLIT = 20
# Number of label-0 samples requested from get_train_samples() per stored entry.
NUM_NEGATIVES = 4
def splitter(df, percentage=None):
    """Split ``df`` into a reduced part and a held-out part, purchase by purchase.

    A random subset of the distinct ``purchaseid`` values is drawn with
    ``random.sample``; every row belonging to one of those purchases goes to
    the held-out frame and every other row stays in the reduced frame, so a
    purchase is never divided between the two.

    Args:
        df: DataFrame with a ``purchaseid`` column.
        percentage: Share (0-100) of distinct purchases to hold out. When
            omitted, the module-level ``PERCENTAGE_SPLIT`` is used.

    Returns:
        ``[df_reduced, df_held_out]`` - two selections of ``df`` whose rows
        are disjoint and together cover all of ``df``.

    Raises:
        ValueError: from ``random.sample`` if the computed amount is negative
            or larger than the number of distinct purchases.
    """
    if percentage is None:
        percentage = PERCENTAGE_SPLIT

    sum_purchase = df['purchaseid'].nunique()
    amount = round((sum_purchase / 100) * percentage)

    held_out_ids = random.sample(df['purchaseid'].unique().tolist(), amount)

    # Build the membership mask once and use it (and its negation) for both parts.
    held_out_mask = df['purchaseid'].isin(held_out_ids)
    return [df.loc[~held_out_mask], df.loc[held_out_mask]]

def generate_matrix(df_main, dataframe, name):
    """Build a sparse purchase x item indicator matrix for ``dataframe``.

    The shape is derived from ``df_main`` only, so matrices built for
    different subsets of the same ``df_main`` have the same shape. Ids are
    used directly as row/column indices.

    Args:
        df_main: Full DataFrame with ``purchaseid`` and ``itemid`` columns;
            determines the matrix shape.
        dataframe: DataFrame whose (``purchaseid``, ``itemid``) pairs are
            written into the matrix.
        name: Unused; kept so existing callers keep working.

    Returns:
        A ``scipy.sparse.dok_matrix`` of dtype ``float32`` holding ``1.0`` at
        every (purchaseid, itemid) pair present in ``dataframe``.
    """
    if len(df_main):
        # Size each axis by the largest id rather than by a count: counting
        # distinct item ids breaks with IndexError as soon as the ids have a
        # gap (e.g. ids {0, 1, 100} -> 3 columns, but column 100 is needed).
        # The row count never drops below the historical len(df_main) so the
        # shape is unchanged for data whose ids are contiguous from 0.
        n_rows = max(df_main.shape[0], int(df_main['purchaseid'].max()) + 1)
        n_cols = int(df_main['itemid'].max()) + 1
    else:
        n_rows = n_cols = 0

    mat = sp.dok_matrix((n_rows, n_cols), dtype=np.float32)
    for purchaseid, itemid in zip(dataframe['purchaseid'], dataframe['itemid']):
        # Plain assignment: a repeated pair stays at 1.0 instead of accumulating.
        mat[purchaseid, itemid] = 1.0

    return mat

# Split once, then work on independent deep copies of the two parts.
dfs = splitter(df)
df_tr, df_val = (part.copy(deep=True) for part in dfs)

# Both matrices are built against the full frame `df`, one per split part.
train_mat, val_mat = (
    generate_matrix(df, part, label)
    for part, label in ((df_tr, 'train'), (df_val, 'val'))
)

def get_train_samples(train_mat, num_negatives):
    """Turn a sparse indicator matrix into positive and sampled negative triples.

    For every stored (row, column) entry one positive sample (label 1) is
    emitted, immediately followed by ``num_negatives`` negative samples
    (label 0) for the same row. Negative columns are drawn uniformly with
    ``np.random.randint`` and redrawn until they are not a stored entry of
    that row; the same negative column may be drawn more than once.

    Args:
        train_mat: Sparse matrix exposing ``.shape`` and ``.keys()`` yielding
            (row, column) pairs, e.g. a ``scipy.sparse.dok_matrix``.
        num_negatives: Number of negatives to draw per positive entry.

    Returns:
        ``(user_input, item_input, labels)`` - three parallel lists of row
        indices, column indices and 0/1 labels.
    """
    num_item = train_mat.shape[1]

    # Snapshot the stored entries and index them by row so the rejection test
    # below is a set lookup on that row's own positives.
    positives = list(train_mat.keys())
    items_by_user = {}
    for u, i in positives:
        items_by_user.setdefault(u, set()).add(i)

    user_input, item_input, labels = [], [], []
    for u, i in positives:
        user_input.append(u)
        item_input.append(i)
        labels.append(1)

        seen = items_by_user[u]
        if len(seen) >= num_item:
            # Every column is already a positive for this row: no negative
            # exists, and rejection sampling would spin forever.
            continue

        # negative instances
        for _ in range(num_negatives):
            j = np.random.randint(num_item)
            while j in seen:
                j = np.random.randint(num_item)
            user_input.append(u)
            item_input.append(j)
            labels.append(0)
    return user_input, item_input, labels

# The matrix dimensions are handed to get_model below.
num_users, num_items = train_mat.shape

# NOTE(review): get_model is not defined anywhere in this snippet, and `...`
# stands in for arguments the author left out - as written it passes the
# literal Ellipsis object. Fill in the real arguments before running.
model = get_model(num_users, num_items, ...)

# Positive and sampled negative (row, column, label) lists for each split.
# get_train_samples draws from NumPy's global random state, so the order of
# these two calls affects which negatives each split receives.
user_input, item_input, labels = get_train_samples(train_mat, NUM_NEGATIVES)
val_user_input, val_item_input, val_labels = get_train_samples(val_mat, NUM_NEGATIVES)

What I need

  • user_input
  • item_input
  • labels
  • val_user_input
  • val_item_input
  • val_labels
  • num_users


from Compress methods and get the data into a One Hot Encoding Matrix

No comments:

Post a Comment