# Data Collect

In [225]:
import pandas as pd
import pickle
import numpy as np

#read data
df = pd.read_excel('reportshitremoved.xlsx')
try:
    df.rename(columns={'ID': 'id'}, inplace=True)
except:
    pass
df.to_pickle("dt_initial.pkl")

In [219]:
#Keys
multiplier = 1

keys = pd.read_excel('keys.xlsx')
keys.insert(1, "id", "0")
keys.insert(1, "title", "keys")
keys.insert(1, "Subject", "keys")
keys.rename(columns={'key': 'content'}, inplace=True)
keys = keys[['id', 'title', 'Subject', 'category', 'content']]
keys = keys.append([keys]*multiplier)

# Pre-processing

1. Clean Data

In [226]:
import re
import string
def clean_text_round1(text):
    #Make text lowercase, remove leading spaces
    text = text.strip().lower()
    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    #remove text in square brackets
    text = re.sub('\[.*?\]', '', text)
    # remove the characters [\], ['] and ["]
    text = re.sub(r"\\", "", text)    
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text) 
    #remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    #remove words containing numbers
    text = re.sub('\w*\d\w*', '', text)
    #Get rid of some additional punctuation
    text = re.sub('[‘’“”…]', '', text)
    #remove next line '\n'
    text = re.sub('\n', '', text)
    #remove duplicate white spaces
    text = re.sub(' +', ' ', text)
    return text
round1 = lambda x: clean_text_round1(x)

In [227]:
df_clean = pd.DataFrame(df.content.apply(round1))
df_temp = df.copy()
df_temp.rename(columns={'content': 'content_old'}, inplace=True)
df_clean = pd.concat([df_temp, df_clean], axis=1)
df_clean=df_clean[df_clean['content'].map(len) > 13]
df = df_clean

In [182]:
#Keys
keys_clean = pd.DataFrame(keys.content.apply(round1))
keys_temp = keys.copy()
keys_temp.rename(columns={'content': 'content_old'}, inplace=True)
keys_clean = pd.concat([keys_temp, keys_clean], axis=1)

2. Stemming and Lemmatization

In [223]:
#################################
#Should I use stemming and lemmatization?
#Accuract decreases by 0.3 - 1.0% by using stemming and lemmatization

In [224]:
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import nltk

lem = WordNetLemmatizer()
stem = PorterStemmer()

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
def lemmatize_text(text):
    wd = ""    
    for w in w_tokenizer.tokenize(text):
        wd = wd + " " + lem.lemmatize(w, "v")        
    return wd

def stem_text(text):
    wds = ""
    for w in w_tokenizer.tokenize(text):        
        wds = wds + " " + stem.stem(w)
    return wds

df['lemm_content'] = df.content.apply(lemmatize_text)
df['stem_content'] = df.content.apply(stem_text)
df.rename(columns={'content': 'content_old1'}, inplace=True)
df.rename(columns={'lemm_content': 'content'}, inplace=True)
df.to_pickle("dt_clean.pkl")

3. Category_id

In [183]:
#Keys
df = df.append(keys_clean)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [228]:
#Define categories
df['category_id'] = df['category'].factorize()[0]
category_id_df = df[['category', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'category']].values)
category_id_df.to_pickle("category_id_df.pkl")
with open('id_to_category.pickle', 'wb') as handle:
    pickle.dump(id_to_category, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('category_to_id.pickle', 'wb') as handle:
    pickle.dump(category_to_id, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [229]:
df_clean.to_pickle("dt_clean.pkl")

4. Define Stop Words

In [230]:
add_stop_words = ['sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sept', 'oct', 'nov', 'dec','january','february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december', 'please', 'see', 'attached', 'attach', 'gm', 'good','morning','dear', 'regard', 'regards','team','thank','you','thanks','folks', 'folk', 'ideas', 'hello','hi','today','yesterday','urgently','afternoon','evening','night','like','im','asap','happy','want']
nonsense_words = ['aaa', 'ap', 'aarp', 'aaaaarp', 'abdelwahed', 'abdo', 'abdullah', 'abhishek', 'zrhzq', 'bslzh']
add_stop_words = add_stop_words + nonsense_words
#add_stop_words = ['sun']

In [231]:
from sklearn.feature_extraction import text 
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)
df_n=df.groupby('category_id', as_index=False)['content'].apply(' '.join)
df_n.reset_index(name='content')
df_m = df_n.to_frame(name='content')

#CountVectorizer: Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(df_m.content)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = df_m.index
data_dtm = data_dtm.transpose()

In [232]:
data = data_dtm
# Find the top 30 words by each category
top_dict = {}
for c in data.columns:
    top = data[c].sort_values(ascending=False).head(60)
    top_dict[c]= list(zip(top.index, top.values))

# Look at the most common top words --> add them to the stop word list
from collections import Counter

# Let's first pull out the top 30 words for each category
words = []
for c in data.columns:
    top = [word for (word, count) in top_dict[c]]
    for t in top:
        words.append(t)
        
# If more than half of the categories have it as a top word, exclude it from the list
common_stop_words = [word for word, count in Counter(words).most_common() if count > 3]

In [233]:
#####
common_stop_words

['hotel',
 'issue',
 'property',
 'check',
 'kind',
 'help',
 'best',
 'data',
 'room',
 'look',
 'advise',
 'day',
 'need',
 'know',
 'care',
 'report',
 'rate',
 'pricing',
 'case',
 'showing',
 'type',
 'days',
 'configuration',
 'screen',
 'alert',
 'error',
 'rates',
 'message',
 'able',
 'file',
 'investigate',
 'date',
 'rooms',
 'revenue',
 'bar',
 'group',
 'forecast',
 'new',
 'change',
 'user',
 'let',
 'correct',
 'upload',
 'time']

In [234]:
#add_stop_words = add_stop_words + common_stop_words

In [235]:
from sklearn.feature_extraction import text 
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

with open('data.pickle', 'wb') as f:
    pickle.dump(stop_words, f, protocol=pickle.HIGHEST_PROTOCOL)

5. Integrate Wordnet

In [236]:
#####################################
#How do I use wordnet to improve BOW for text classification?

In [237]:
#import nltk
#nltk.download('wordnet')

# Feature Engineering

In [238]:
#####################################
#Feature engineering algo:
#    tfidf word level       .
#    tfidf n-gram level     .currently using
#    tfidf char level       .low accuracy
#    count vectorizer       .very basic vectorizer
#    Word Embedding         .advanced, new, supposed to pump high accuracies, algos: Word2Vec, GloVe

In [213]:
#tfidf word level, 86-89
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(analyzer='word', sublinear_tf=True, norm='l2', encoding='latin-1', stop_words=stop_words)

features = tfidf.fit_transform(df.content).toarray()
labels = df.category_id
features.shape

(5285, 7928)

In [239]:
#tfidf n-gram level, 79-86, 82-87
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(analyzer='word', sublinear_tf=True, norm='l2', encoding='latin-1', ngram_range=(1,2), stop_words=stop_words)

features = tfidf.fit_transform(df.content).toarray()
labels = df.category_id
features.shape

data_tfidf = tfidf.fit_transform(df.content)
data_dtm = pd.DataFrame(data_tfidf.toarray(), columns=tfidf.get_feature_names())
data_dtm.index = df.index
data_dtm = data_dtm.transpose()
data_dtm.sort_values(by=1, ascending=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3862,3863,3864,3865,3866,3867,3868,3869,3870,3871
remove,0.0,0.203414,0.125941,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.220697,0.220697,0.220697,0.0,0.0,0.0,0.000000,0.0
transaction recorded,0.0,0.197936,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0
zhu hotel,0.0,0.197936,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0
advised previously,0.0,0.197936,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0
recorded possible,0.0,0.197936,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0
remove market,0.0,0.197936,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0
longer used,0.0,0.197936,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0
records segment,0.0,0.197936,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0
segment longer,0.0,0.197936,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0
segment zhu,0.0,0.197936,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0


In [395]:
##tfidf char level, 86-89
#from sklearn.feature_extraction.text import TfidfVectorizer
#tfidf = TfidfVectorizer(analyzer='char', sublinear_tf=True, norm='l2', encoding='latin-1', ngram_range=(2,3))

#features = tfidf.fit_transform(df.content).toarray()
#labels = df.category_id
#features.shape

#data_tfidf = tfidf.fit_transform(df.content)
#data_dtm = pd.DataFrame(data_tfidf.toarray(), columns=tfidf.get_feature_names())
#data_dtm.index = df.index
#data_dtm = data_dtm.transpose()
#data_dtm.sort_values(by=1, ascending=False)

In [326]:
#CountVectorizer: Bag of Words, 85-87
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(df.content)
features = cv.fit_transform(df.content).toarray()
labels = df.category_id
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = df.index
data_dtm = data_dtm.transpose()
data_dtm.sort_values(by=1, ascending=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2549,2550,2551,2552,2553,2554,2555,2556,2557,2558
error,0,2,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
software,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
management,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
devices,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
laptop,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
showing,1,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
trying,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
pricing,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
urgent,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
pogoretski,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Word embedding

In [None]:
from sklearn import model_selection, preprocessing
from sklearn import decomposition, ensemble

#import xgboost, textblob, string
from keras.preprocessing import sequence
from keras.preprocessing import text as text_keras
from keras import layers, models, optimizers

In [None]:
# load the pre-trained word-embedding vectors, 50
embeddings_index = {}
for i, line in enumerate(open('wiki-news-300d-1M.vec', encoding='utf-8', newline='\n', errors='ignore')):
    values = line.split()
    embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
#text="The first line of the file contains the number of words in the vocabulary and the size of the vectors."
# create a tokenizer 
token = text_keras.Tokenizer()
token.fit_on_texts(df.content)
word_index = token.word_index

In [None]:
# convert text to sequence of tokens and pad them to ensure equal length vectors 
#X_train = sequence.pad_sequences(token.texts_to_sequences(X_train))
#X_test = sequence.pad_sequences(token.texts_to_sequences(X_test))
features = sequence.pad_sequences(token.texts_to_sequences(df.content))
labels = df.category_id

# create token-embedding mapping
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
#pickle.dump(word_index, open("word_index.pkl", 'wb'))
#pickle.dump(embedding_matrix, open("embedding_matrix.pkl", 'wb'))

# Train Test Split

In [240]:
#25%
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, df.index, test_size=0.20)

In [241]:
###################################################
#to counter imbalanced data should i oversample data using these oversampling algos?

In [242]:
#Oversampling
from imblearn.over_sampling import ADASYN
adas = ADASYN(n_neighbors = 5, n_jobs=4)

from imblearn.over_sampling import SMOTE
sm = SMOTE(n_jobs=4)

from imblearn.over_sampling import RandomOverSampler
rm = RandomOverSampler()

from imblearn.combine import SMOTEENN
smten = SMOTEENN()

from imblearn.combine import SMOTETomek
smtok = SMOTETomek()

#X_train, y_train = sm.fit_sample(X_train, y_train)
#features, labels = adas.fit_sample(features, labels)

In [243]:
np.save('X_train', X_train)
np.save('y_train', y_train)
np.save('features', features)
np.save('labels', labels)
np.save('X_test', X_test)
np.save('y_test', y_test)
np.save('indices_train', indices_train)
np.save('indices_test', indices_test)
features.shape

(3867, 44740)