# here is my code
import matplotlib.pyplot as plt
#import nltk
import numpy as np
import pandas as pd
import seaborn as sns
#from wordcloud import WordCloud, STOPWORDS
# Load the raw reviews dataset.  Path assumes the repo layout places this
# script three levels below the data directory -- TODO confirm.
df = pd.read_csv("../../../data/datasets/women_reviews.csv")
print(df.head())
# print(df.shape)
# Keep only rows where all of the key categorical/text columns are populated.
for column in ["Division Name", "Department Name", "Class Name", "Review Text"]:
    df = df[df[column].notnull()]
# Drop the first column (the unnamed index written by the CSV export).
df.drop(df.columns[0], inplace=True, axis=1)
# print(df.shape)
# Binary sentiment label: ratings >= 3 count as positive (1), else negative (0).
df["Label"] = 0
df.loc[df.Rating >= 3, ["Label"]] = 1
# print(df.head())
# Plot the frequency distribution of the raw rating and the derived label.
cat_dtypes = ["Rating", "Label"]
f, axes = plt.subplots(1, len(cat_dtypes), figsize=(16, 6), sharex=False)
# The original code kept a separate `increment` counter that always equalled
# the loop index; the index alone is sufficient.
for i in range(len(cat_dtypes)):
    sns.countplot(x=cat_dtypes[i], data=df,
                  order=df[cat_dtypes[i]].value_counts().index, ax=axes[i])
    axes[i].set_title("Frequency Distribution for\n{}".format(cat_dtypes[i]))
    axes[i].set_ylabel("Occurrence")
    axes[i].set_xlabel("{}".format(cat_dtypes[i]))
# Only the leftmost subplot keeps its y-axis label.
axes[1].set_ylabel("")
# axes[2].set_ylabel("")
plt.savefig("freqdist-rating-recommended-label.png", format="png", dpi=300)
# plt.show()
# Disabled: countplot + percentage-normalized barplot of Rating split by
# "Recommended IND".  NOTE(review): relies on a percentstandardize_barplot()
# helper that is not defined in this file -- confirm its source before
# re-enabling.
"""huevar = "Rating"
f, axes = plt.subplots(1, 2, figsize=(16, 7))
sns.countplot(x="Rating", hue="Recommended IND", data=df, ax=axes[0])
axes[0].set_title("Occurrence of {}\nby {}".format(huevar, "Recommended IND"))
axes[0].set_ylabel("Count")
percentstandardize_barplot(x="Rating", y="Percentage", hue="Recommended IND", data=df, ax=axes[1])
# axes[1].set_title("Percentage Normalized Occurrence of {}\nby {}".format(huevar, "Recommended IND"))
# axes[1].set_ylabel("% Percentage by Rating")
plt.savefig("rating-recommended.png", format="png", dpi=300)
plt.show()"""
# Widen column display so long review texts are not truncated when printed.
# ("max_colwidth" without the "display." prefix is a deprecated alias.)
pd.set_option("display.max_colwidth", 300)
# print(df[["Title", "Review Text", "Rating", "Label"]].sample(10))
# --- Imports for the model-building half of the script ---
import os, sys
import re
import string
import pathlib
import random
from collections import Counter, OrderedDict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from tqdm import tqdm, tqdm_notebook, tnrange

tqdm.pandas(desc="Progress")  # enables DataFrame.progress_apply with a bar

import torch.cuda
# Alias "t" to the CUDA namespace when a GPU is present so tensor types
# resolve to the device-appropriate implementation -- TODO confirm this
# alias is actually used downstream.
if torch.cuda.is_available():
    import torch.cuda as t
else:
    import torch as t
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torchtext
from torchtext import data
from torchtext import vocab
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.metrics import accuracy_score
import warnings

warnings.filterwarnings("ignore")  # silence library warning chatter
# device = torch.device("cuda:0")
datapath = pathlib.Path("./datasets")
print(datapath)
# Rename to an attribute-friendly identifier and replace embedded newlines
# with spaces so each review stays on a single CSV row when re-exported.
df = df.rename(columns={"Review Text": "ReviewText"})
# print(df.head())
df["ReviewText"] = df.ReviewText.progress_apply(lambda x: re.sub("\n", " ", x))
# split datasets
def split_dataset(df, test_size=0.2):
    """Split a DataFrame into train/validation parts with fresh indexes.

    Args:
        df: source DataFrame.
        test_size: fraction of rows held out for validation.

    Returns:
        Tuple of (train, validation) DataFrames, each re-indexed from 0.
    """
    train_part, val_part = train_test_split(
        df, test_size=test_size, random_state=42)
    return (train_part.reset_index(drop=True),
            val_part.reset_index(drop=True))
# 80/20 train/validation split (seeded inside split_dataset).
traindf, valdf = split_dataset(df, test_size=0.2)
# shape of traindf, valdf
# Disabled diagnostics; converted the original Python-2 print statements to
# Python-3 calls so the block is usable if re-enabled.
"""print("train-shape")
print(traindf.shape)
print(traindf.Label.value_counts())
print("val-shape")
print(valdf.Label.value_counts())"""
# save csv files for training and validation (written to the cwd)
traindf.to_csv("traindf.csv", index=False)
valdf.to_csv("valdf.csv", index=False)
# preprocessing
# print(traindf.head())
# Lightweight spaCy pipeline: only tokenization is needed, so the parser,
# tagger and NER components are disabled for speed.
nlp = spacy.load("en", disable=["parser", "tagger", "ner"])

def tokenizer(s):
    """Clean raw text with tweet_clean(), then return lowercased tokens."""
    return [w.text.lower() for w in nlp(tweet_clean(s))]
def tweet_clean(txt):
    """Normalize raw review text for tokenization.

    Removes URLs, collapses every run of non-alphanumeric characters to a
    single space, and strips leading/trailing whitespace.
    """
    # BUGFIX: URL removal must run BEFORE the character filter.  The original
    # stripped punctuation first, destroying the "://" so the URL pattern
    # could never match.
    txt = re.sub(r"https?://\S+", " ", txt)
    txt = re.sub(r"[^A-Za-z0-9]+", " ", txt)
    return txt.strip()
"""For text columns or fields, below parameters are used.
"sequential=True"
It tells torchtext that the data is in form of sequence and not discrete
"tokenize=tokenizer"
This attribute takes a function that will tokenize a given text. In this case the function will tokenize a single tweet. You can also pass "spacy" string in this attribute if spacy is installed.
"include_lengths=True"
Apart from tokenized text we will also need the lengths of the tweets for RNN
"use_vocab=True"
Since this is used to process the text data, we need to create the vocabulary of unique words. This attribute tells torchtext to create the vocabulary
"""
"""For label columns or fields, below parameters are used.
"sequential=False"
Now we are defining the blueprint of label columns. Labels are not sequential data, they are discrete. So this attribute is false
"use_vocab=False"
Since it is a binary classification problem and labels are already numericalized, we will set this to false
"pad_token=None"
"unk_token=None"
We don't need padding and out of vocabulary tokens for labels."""
# define fields: the blueprint torchtext uses to parse each CSV column.
# Text field: sequential tokens, lengths kept for packed RNN sequences,
# vocabulary built over the tokens.
# NOTE(review): the original also passed postprocessing=lambda x: float(x)
# here, which would call float() on token lists and crash at batching time;
# removed.
txt_field = data.Field(sequential=True, tokenize=tokenizer,
                       include_lengths=True, use_vocab=True)
# Label field: discrete, already numeric -> no vocab, padding or unk token;
# cast the string read from the CSV to float.
label_field = data.Field(sequential=False, use_vocab=False, pad_token=None,
                         unk_token=None,
                         postprocessing=data.Pipeline(lambda x: float(x)))
# Columns mapped to None are ignored when the CSV is loaded.
train_val_fields = [
    ("Clothing ID", None),
    ("Age", None),
    ("Title", None),
    ("ReviewText", txt_field),
    ("Rating", None),
    ("Recommended IND", None),
    ("Positive Feedback Count", None),
    ("Division Name", None),
    ("Department Name", None),
    ("Class Name", None),
    ("Label", label_field)]
"""Parameters to TabularDataset.splits:
path=""
Directory where the csv/tsv files are stored (here: the current directory,
matching where traindf.csv/valdf.csv were written above)
format="csv"
Format of the files that will be loaded and processed
train="traindf.csv"
Name of train file. The final path becomes ./traindf.csv
validation="valdf.csv"
Name of validation file. The final path becomes ./valdf.csv
fields=train_val_fields
Tell torchtext how the incoming columns will be processed
skip_header=True
Skip the first line in the csv, since it contains the header"""
trainds, valds = data.TabularDataset.splits(
    path="", format="csv", train="traindf.csv", validation="valdf.csv",
    fields=train_val_fields, skip_header=True)
print(type(trainds))
print((len(trainds), len(valds)))
print(trainds.fields.items())
# Inspect one parsed example.
example = trainds[0]
print(type(example))
print(type(example.ReviewText))
print(type(example.Label))
# load pretrained word vectors (GloVe trained on Twitter, 100 dimensions)
from torchtext import vocab
# vec = vocab.Vectors("glove.42B.300d.txt", "../../../data/")
vec = vocab.GloVe(name="twitter.27B", dim=100)
print(vec)
# Build the text vocabulary from both splits, capped at 100k types, and
# attach the pretrained vectors to it.
txt_field.build_vocab(trainds, valds, max_size=100000, vectors=vec)
# build vocab for labels
# label_field.build_vocab(trainds)
print(txt_field.vocab.vectors.shape)
# print(txt_field.vocab.vectors[txt_field.vocab.stoi["awesome"]])