Skip to content

NLP

Let's go through a simple example of integrating the Aporia SDK with a Natural Language Processing (NLP) model.

STEP 1: Add The Model

Click the Add Model button in the Models page.

Enter the model name (for this example we call it "sentiment analysis model") and optionally a description. Click Next.

STEP 2: Train A Model

For this example, we will train a simple sentiment analysis model using NLTK, based on this example.

import re
import string
import uuid

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Load tweets data, based on this data set https://www.kaggle.com/crowdflower/twitter-airline-sentiment
# The path below is where the CSV was stored for this example; adjust it to your environment.
tweets = pd.read_csv("/content/Tweets.csv")

# Data pre-processing
# The position of a sentiment in this list is the integer label used for it.
SENTIMENT_ORDERING = ["negative", "neutral", "positive"]
# Passed as max_features to the CountVectorizer instances below.
MAX_FEATURES = 3000
PUNCT = string.punctuation
# The stopwords corpus has to be available locally before it is read;
# on a fresh environment stopwords.words() raises LookupError otherwise.
nltk.download("stopwords")
STOP_WORDS = stopwords.words("english")


def pre_process_text(input_text: str, stemmer: PorterStemmer) -> str:
    """Normalize raw text into a space-separated string of stemmed tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, tokens found in
    ``STOP_WORDS`` or ``PUNCT`` are dropped, and each remaining token is
    passed through ``stemmer.stem``.

    Args:
        input_text: Raw text to clean.
        stemmer: Stemmer whose ``stem`` method is applied to every kept token.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", input_text)
    stems = []
    for token in letters_only.lower().split():
        # Skip stop words and anything matching the punctuation constant.
        if token in STOP_WORDS or token in PUNCT:
            continue
        stems.append(stemmer.stem(token))
    return " ".join(stems)


# Discard tweets whose sentiment label confidence is below 0.5.
low_confidence = tweets["airline_sentiment_confidence"] < 0.5
tweets_df = tweets.drop(tweets[low_confidence].index, axis=0)
X = tweets_df["text"]
y = tweets_df["airline_sentiment"]

nltk.download("stopwords")
stemmer = PorterStemmer()

# Clean every tweet (iterating the Series directly instead of indexing by
# position), then encode each label as its position in SENTIMENT_ORDERING.
cleaned_data = [pre_process_text(input_text=text, stemmer=stemmer) for text in X]
y = y.map(SENTIMENT_ORDERING.index)


# Bag-of-words counts; the two listed tokens are excluded from the vocabulary.
cv = CountVectorizer(max_features=MAX_FEATURES, stop_words=["virginamerica", "unit"])
X_fin = cv.fit_transform(cleaned_data).toarray()

# Store the vocabulary for future usage
vocabulary = cv.vocabulary_
cv_v2 = CountVectorizer(max_features=MAX_FEATURES, vocabulary=vocabulary)


# Train the model on 70% of the data; the remaining 30% is held out.
model = MultinomialNB()
X_train, X_test, y_train, y_test = train_test_split(X_fin, y, test_size=0.3)
model.fit(X_train, y_train)

STEP 3: Initialize the Aporia SDK & Create Model Version

First, we should initialize aporia and define a version for the new model.

import aporia
# The token and environment values are the ones used in this example;
# replace them with your own.
aporia.init(token='123', environment='example')

# The model predicts one of three sentiment classes (negative, neutral,
# positive), so it is registered as a multiclass model rather than a binary one.
aporia.create_model_version(
  model_id="sentiment-analysis-model",
  model_version="v1",
  model_type="multiclass",
  raw_inputs={
    "input_text": "text",
  },
  features={
    "processed_input": "vector",
  },
  predictions={
    "sentiment": "string",
  }
)
Note that the "text" type is meant for large text inputs (e.g., an NLP model's raw textual input).

STEP 4: Predict

Since reporting training data is not supported for NLP models, we will go directly to reporting predictions.

import uuid

input_text = "This is a great guide, I love it"

# Apply the same pre-processing that was used on the training data.
predict_stemmer = PorterStemmer()
processed_text = pre_process_text(input_text, predict_stemmer)
# cv_v2 was constructed with the fixed training vocabulary, so only
# transform() is needed at inference time; nothing should be fitted on a
# single input.
processed_input = cv_v2.transform([processed_text]).toarray()

# One input row was passed, so take the single predicted label index.
prediction = model.predict(processed_input)[0]

apr_model = aporia.Model(model_id="sentiment-analysis-model", model_version="v1")
apr_model.log_prediction(
    id=str(uuid.uuid4()),
    raw_inputs={
        "input_text": input_text,
    },
    features={
        # NOTE(review): processed_input is a 2-D array with a single row;
        # confirm that the "vector" feature type accepts this shape.
        "processed_input": processed_input,
    },
    predictions={
        # Map the integer label back to its sentiment name.
        "sentiment": SENTIMENT_ORDERING[prediction],
    },
)

# NOTE(review): presumably forces any buffered predictions to be sent; confirm against the SDK docs.
apr_model.flush()