In [1]:
import numpy as np 
import pandas as pd
In [2]:
# read the raw reviews (one review per line); add encoding='latin-1' if your copies of the files are not UTF-8
short_pos = open("pos.txt", "r").read()
short_neg = open("neg.txt", "r").read()
In [3]:
documents = []    # all reviews, positives first
documents1 = []   # positive reviews only
documents2 = []   # negative reviews only

for r in short_pos.split('\n'):
    documents1.append(r)
    documents.append(r)

for r in short_neg.split('\n'):
    documents2.append(r)
    documents.append(r)
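A quick sanity check (not in the original run) confirms how many reviews landed in each list; the exact counts depend on your copies of pos.txt and neg.txt:

print(len(documents1), len(documents2), len(documents))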
In [4]:
from sklearn.feature_extraction.text import CountVectorizer
vector = CountVectorizer(stop_words='english')
vector.fit(documents)
Out[4]:
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)
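To see what the vectorizer learned, you can inspect its fitted vocabulary (a quick sketch, not in the original run; vocabulary_ is a dict mapping each term to its column index):

print(len(vector.vocabulary_))          # number of distinct terms
print(sorted(vector.vocabulary_)[:10])  # first few terms alphabetically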
In [5]:
counts1 = vector.transform(documents1)  # term counts for the positive reviews
array1 = counts1.toarray()
myDataFrame1 = pd.DataFrame(array1)     # array1 is already an ndarray; no extra np.array() needed
myDataFrame1['class'] = 'pos'           # label column
counts1.toarray()
Out[5]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)
In [6]:
counts2 = vector.transform(documents2)  # term counts for the negative reviews
array2 = counts2.toarray()
myDataFrame2 = pd.DataFrame(array2)
myDataFrame2['class'] = 'neg'           # label column
counts2.toarray()
Out[6]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)
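At this point the two labelled frames should share the same feature columns; a shape check (not in the original run) makes that explicit:

print(myDataFrame1.shape, myDataFrame2.shape)  # each: (n_reviews, 18072 features + 1 class column)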
In [7]:
frames = [myDataFrame1, myDataFrame2]
df = pd.concat(frames)                         # stack positives on top of negatives
df = df.sample(frac=1).reset_index(drop=True)  # shuffle the rows and reset the index
df
Out[7]:
0 1 2 3 4 5 6 7 8 9 ... 18063 18064 18065 18066 18067 18068 18069 18070 18071 class
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 pos
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 neg
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 neg
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10660 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 neg
10661 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 neg

10662 rows × 18073 columns
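Before splitting, it is worth confirming the shuffle preserved both labels in roughly equal numbers (a quick check, not in the original run):

print(df['class'].value_counts())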

In [8]:
# split the data into a feature matrix X and a label vector y
X = df.iloc[:, :18072]  # the 18072 term-count columns
y = df.iloc[:, -1]      # the 'class' column
In [9]:
X = df.iloc[:, :18072].values  # same split, but as NumPy arrays
y = df.iloc[:, -1].values
X
Out[9]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)
In [10]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
X = tfidf.fit_transform(X)  # reweight the raw counts by tf-idf; the labels are not needed here
print(X.shape)
print(X.toarray())
(10662, 18072)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
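The CountVectorizer-then-TfidfTransformer pair can be collapsed into a single step with TfidfVectorizer, which is documented as equivalent to running the two in sequence. A sketch of that alternative (tfidf_vec and X_alt are illustrative names, and the rows come out in the original, unshuffled order):

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer(stop_words='english')
X_alt = tfidf_vec.fit_transform(documents)  # tf-idf features for all reviews at once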
In [11]:
# split the data into training and test sets (70% train / 30% test)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
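Neither the shuffle above nor this split is seeded, so the accuracies below will vary from run to run. Passing random_state (and optionally stratify) makes the split repeatable; a sketch, not the original call:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)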
In [12]:
X_train
Out[12]:
<7463x18072 sparse matrix of type '<class 'numpy.float64'>'
	with 72213 stored elements in Compressed Sparse Row format>
In [13]:
# logistic regression classifier
from sklearn.linear_model import LogisticRegression
classifierLogisticRegression = LogisticRegression()
classifierLogisticRegression.fit(X_train, y_train)

y_predictorLogisticRegression = classifierLogisticRegression.predict(X_test)
y_predictorLogisticRegression
Out[13]:
array(['pos', 'neg', 'pos', ..., 'neg', 'neg', 'pos'], dtype=object)
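LogisticRegression also exposes per-class probabilities via predict_proba, which can be more informative than the hard labels above (a quick sketch, not in the original run):

print(classifierLogisticRegression.classes_)            # column order of the probabilities
print(classifierLogisticRegression.predict_proba(X_test[:5]))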
In [14]:
from sklearn.svm import SVC
classifierSVC = SVC(kernel='linear')  # linear-kernel support vector classifier
classifierSVC.fit(X_train, y_train)   # y_train is already 1-D, so ravel() is unnecessary

y_predictorSVC = classifierSVC.predict(X_test)
y_predictorSVC
Out[14]:
array(['pos', 'neg', 'pos', ..., 'neg', 'neg', 'pos'], dtype=object)
In [15]:
from sklearn.naive_bayes import MultinomialNB
classifierMultinomialNB = MultinomialNB(alpha=1)  # alpha=1 is Laplace (add-one) smoothing
classifierMultinomialNB.fit(X_train, y_train)

y_predictorMultinomialNB = classifierMultinomialNB.predict(X_test)
y_predictorMultinomialNB
Out[15]:
array(['pos', 'neg', 'pos', ..., 'neg', 'neg', 'pos'], dtype='<U3')
In [16]:
# k-nearest-neighbours classifier; Minkowski distance with p=2 is plain Euclidean distance
from sklearn.neighbors import KNeighborsClassifier
classifierKNeighborsClassifier = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
classifierKNeighborsClassifier.fit(X_train, y_train)

y_predictorKNeighborsClassifier = classifierKNeighborsClassifier.predict(X_test)
y_predictorKNeighborsClassifier
Out[16]:
array(['neg', 'neg', 'neg', ..., 'neg', 'neg', 'pos'], dtype=object)
In [17]:
# GaussianNB requires dense input, so convert the sparse matrices with .toarray()
from sklearn.naive_bayes import GaussianNB
classifierGaussianNB = GaussianNB()
classifierGaussianNB.fit(X_train.toarray(), y_train)
y_predictorGaussianNB = classifierGaussianNB.predict(X_test.toarray())
y_predictorGaussianNB
Out[17]:
array(['pos', 'neg', 'pos', ..., 'pos', 'neg', 'pos'], dtype='<U3')
In [18]:
# from sklearn.metrics import confusion_matrix
# cm = confusion_matrix(y_test, y_predictorSVC)
# cm
In [19]:
from sklearn.metrics import accuracy_score
print("LogisticRegression : ", accuracy_score(y_test, y_predictorLogisticRegression))
print("SVM(SVC) : ", accuracy_score(y_test, y_predictorSVC))
print("MultinomialNB : ", accuracy_score(y_test, y_predictorMultinomialNB))
print("KNeighbors : ", accuracy_score(y_test, y_predictorKNeighborsClassifier))
print("GaussianNB : ", accuracy_score(y_test, y_predictorGaussianNB))
LogisticRegression :  0.75054704595186
SVM(SVC) :  0.745232885276649
MultinomialNB :  0.7636761487964989
KNeighbors :  0.5873710534542045
GaussianNB:  0.6308221319162238
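Accuracy alone hides per-class behaviour; a confusion matrix and classification report give precision, recall, and F1 for each label. A sketch against the best-scoring model here, MultinomialNB:

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_predictorMultinomialNB))
print(classification_report(y_test, y_predictorMultinomialNB))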