I'm trying to use camembert model to just to extract text features. After that, I'm trying to use a KNN classifier to classify the feature vectors as inputs.
This is the code I wrote
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, CamembertModel
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import torch
# Initialize the Camembert tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("camembert-base")
model = CamembertModel.from_pretrained("camembert-base")
# Select the relevant columns from the DataFrame
cols = ["Intitulé (Ce champ doit respecter la nomenclature suivante : Code action – Libellé)_y", "Domaine sou domaine "]
df = df[cols]
# Convert DataFrame to a dictionary with row indices as keys and cell values as values
data = df.to_dict(orient='split')
data = dict(zip(data['index'], data['data']))
# Collect all the input texts into a list of strings
input_texts = [str(text) for text in data.values()]
# Set the batch size for processing
batch_size = 8
# Initialize lists to store the input features and labels
input_features_list = []
input_labels_list = []
# Process the data in batches
for i in range(0, len(input_texts), batch_size):
batch_texts = input_texts[i:i + batch_size]
# Tokenize the batch of texts
inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)
# Get the model outputs for the batch
with torch.no_grad():
outputs = model(**inputs)
# Extract the last hidden states and convert them to a numpy array
last_hidden_states = outputs.last_hidden_state
input_features = last_hidden_states[:, 0, :].numpy()
# Add the input features and labels to the corresponding lists
input_features_list.append(input_features)
input_labels_list.extend(list(data.keys())[i:i + batch_size])
# Concatenate the input features and labels for all batches
input_features = np.concatenate(input_features_list)
input_labels = input_labels_list
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(input_features, input_labels, test_size=0.2, random_state=42)
# Initialize and train the KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)
# Make predictions on the testing data
y_pred = neigh.predict(X_test)
# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
The data I'm using in the dictionary has this form:
{
'index': [row_index_1, row_index_2, ...],
'columns': [column_name_1, column_name_2, ...],
'data': [
[cell_value_row_1_col_1, cell_value_row_1_col_2, ...],
[cell_value_row_2_col_1, cell_value_row_2_col_2, ...],
...
]
}
When executing the code, it runs for 70 minutes then I get 0.0 Accuracy. Please let me know if want me to provide more details.
from 0.0 Accuracy when using KNN on extracted text features by camemBERT
No comments:
Post a Comment