In [1]:
import numpy as np 
import pandas as pd
In [2]:
# read the raw reviews (one review per line); add encoding='latin-1' if your copies of the files are not UTF-8
short_pos = open("pos.txt", "r").read()
short_neg = open("neg.txt", "r").read()
In [3]:
documents = []    # all reviews, positives first
documents1 = []   # positive reviews only
documents2 = []   # negative reviews only

for r in short_pos.split('\n'):
    documents1.append(r)
    documents.append(r)

for r in short_neg.split('\n'):
    documents2.append(r)
    documents.append(r)
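A quick sanity check (not in the original run) confirms how many reviews landed in each list; the exact counts depend on your copies of pos.txt and neg.txt:

print(len(documents1), len(documents2), len(documents))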
In [4]:
from sklearn.feature_extraction.text import CountVectorizer
vector = CountVectorizer(stop_words='english')
vector.fit(documents)
Out[4]:
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)
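To see what the vectorizer learned, you can inspect its fitted vocabulary (a quick sketch, not in the original run; vocabulary_ is a dict mapping each term to its column index):

print(len(vector.vocabulary_))          # number of distinct terms
print(sorted(vector.vocabulary_)[:10])  # first few terms alphabetically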
In [5]:
counts1 = vector.transform(documents1)  # term counts for the positive reviews
array1 = counts1.toarray()
myDataFrame1 = pd.DataFrame(array1)     # array1 is already an ndarray; no extra np.array() needed
myDataFrame1['class'] = 'pos'           # label column
counts1.toarray()
Out[5]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)
In [6]:
counts2 = vector.transform(documents2)  # term counts for the negative reviews
array2 = counts2.toarray()
myDataFrame2 = pd.DataFrame(array2)
myDataFrame2['class'] = 'neg'           # label column
counts2.toarray()
Out[6]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)
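At this point the two labelled frames should share the same feature columns; a shape check (not in the original run) makes that explicit:

print(myDataFrame1.shape, myDataFrame2.shape)  # each: (n_reviews, 18072 features + 1 class column)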
In [7]:
frames = [myDataFrame1, myDataFrame2]
df = pd.concat(frames)                         # stack positives on top of negatives
df = df.sample(frac=1).reset_index(drop=True)  # shuffle the rows and reset the index
df
Out[7]:
0 1 2 3 4 5 6 7 8 9 ... 18063 18064 18065 18066 18067 18068 18069 18070 18071 class
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 pos
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 neg
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 neg
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10660 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 neg
10661 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 neg

10662 rows × 18073 columns
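Before splitting, it is worth confirming the shuffle preserved both labels in roughly equal numbers (a quick check, not in the original run):

print(df['class'].value_counts())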

In [8]:
# split the data into a feature matrix X and a label vector y
X = df.iloc[:, :18072]  # the 18072 term-count columns
y = df.iloc[:, -1]      # the 'class' column
In [9]:
X = df.iloc[:, :18072].values  # same split, but as NumPy arrays
y = df.iloc[:, -1].values
X
Out[9]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)
In [10]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
X = tfidf.fit_transform(X)  # reweight the raw counts by tf-idf; the labels are not needed here
print(X.shape)
print(X.toarray())
(10662, 18072)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
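The CountVectorizer-then-TfidfTransformer pair can be collapsed into a single step with TfidfVectorizer, which is documented as equivalent to running the two in sequence. A sketch of that alternative (tfidf_vec and X_alt are illustrative names, and the rows come out in the original, unshuffled order):

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer(stop_words='english')
X_alt = tfidf_vec.fit_transform(documents)  # tf-idf features for all reviews at once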
In [11]:
# split the data into training and test sets (70% train / 30% test)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
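Neither the shuffle above nor this split is seeded, so the accuracies below will vary from run to run. Passing random_state (and optionally stratify) makes the split repeatable; a sketch, not the original call:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)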
In [12]:
X_train
Out[12]:
<7463x18072 sparse matrix of type '<class 'numpy.float64'>'
	with 72213 stored elements in Compressed Sparse Row format>
In [13]:
# logistic regression classifier
from sklearn.linear_model import LogisticRegression
classifierLogisticRegression = LogisticRegression()
classifierLogisticRegression.fit(X_train, y_train)

y_predictorLogisticRegression = classifierLogisticRegression.predict(X_test)
y_predictorLogisticRegression
Out[13]:
array(['pos', 'neg', 'pos', ..., 'neg', 'neg', 'pos'], dtype=object)
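LogisticRegression also exposes per-class probabilities via predict_proba, which can be more informative than the hard labels above (a quick sketch, not in the original run):

print(classifierLogisticRegression.classes_)            # column order of the probabilities
print(classifierLogisticRegression.predict_proba(X_test[:5]))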
In [14]:
from sklearn.svm import SVC
classifierSVC = SVC(kernel='linear')  # linear-kernel support vector classifier
classifierSVC.fit(X_train, y_train)   # y_train is already 1-D, so ravel() is unnecessary

y_predictorSVC = classifierSVC.predict(X_test)
y_predictorSVC
Out[14]:
array(['pos', 'neg', 'pos', ..., 'neg', 'neg', 'pos'], dtype=object)
In [15]:
from sklearn.naive_bayes import MultinomialNB
classifierMultinomialNB = MultinomialNB(alpha=1)  # alpha=1 is Laplace (add-one) smoothing
classifierMultinomialNB.fit(X_train, y_train)

y_predictorMultinomialNB = classifierMultinomialNB.predict(X_test)
y_predictorMultinomialNB
Out[15]:
array(['pos', 'neg', 'pos', ..., 'neg', 'neg', 'pos'], dtype='<U3')
In [16]:
# k-nearest-neighbours classifier; Minkowski distance with p=2 is plain Euclidean distance
from sklearn.neighbors import KNeighborsClassifier
classifierKNeighborsClassifier = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
classifierKNeighborsClassifier.fit(X_train, y_train)

y_predictorKNeighborsClassifier = classifierKNeighborsClassifier.predict(X_test)
y_predictorKNeighborsClassifier
Out[16]:
array(['neg', 'neg', 'neg', ..., 'neg', 'neg', 'pos'], dtype=object)
In [17]:
# GaussianNB requires dense input, so convert the sparse matrices with .toarray()
from sklearn.naive_bayes import GaussianNB
classifierGaussianNB = GaussianNB()
classifierGaussianNB.fit(X_train.toarray(), y_train)
y_predictorGaussianNB = classifierGaussianNB.predict(X_test.toarray())
y_predictorGaussianNB
Out[17]:
array(['pos', 'neg', 'pos', ..., 'pos', 'neg', 'pos'], dtype='<U3')
In [18]:
# from sklearn.metrics import confusion_matrix
# cm = confusion_matrix(y_test, y_predictorSVC)
# cm
In [19]:
from sklearn.metrics import accuracy_score
print("LogisticRegression : ", accuracy_score(y_test, y_predictorLogisticRegression))
print("SVM(SVC) : ", accuracy_score(y_test, y_predictorSVC))
print("MultinomialNB : ", accuracy_score(y_test, y_predictorMultinomialNB))
print("KNeighbors : ", accuracy_score(y_test, y_predictorKNeighborsClassifier))
print("GaussianNB : ", accuracy_score(y_test, y_predictorGaussianNB))
LogisticRegression :  0.75054704595186
SVM(SVC) :  0.745232885276649
MultinomialNB :  0.7636761487964989
KNeighbors :  0.5873710534542045
GaussianNB:  0.6308221319162238
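Accuracy alone hides per-class behaviour; a confusion matrix and classification report give precision, recall, and F1 for each label. A sketch against the best-scoring model here, MultinomialNB:

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_predictorMultinomialNB))
print(classification_report(y_test, y_predictorMultinomialNB))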