-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
213 lines (176 loc) · 6.74 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
import pandas as pd
import re
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Read the five CSV sources: two "real" sets, two "fake" sets and one
# pre-labelled mixed set.
true1 = pd.read_csv("dataset/True.csv")
true2 = pd.read_csv("dataset/second_set/real.csv")
false1 = pd.read_csv("dataset/Fake.csv")
false2 = pd.read_csv("dataset/second_set/fake.csv")
both_news = pd.read_csv("dataset/WELFake_Dataset.csv/WELFake_Dataset.csv")

# Prefix each article body with its headline (true2/false2 are left as-is).
for _frame in (true1, false1, both_news):
    _frame["text"] = _frame["title"] + " " + _frame["text"]

# Label the single-class frames: 1 = real, 0 = fake.
for _frame, _label in ((true1, 1), (true2, 1), (false1, 0), (false2, 0)):
    _frame["category"] = _label

# The mixed set ships its own label column; expose it under the shared name.
# NOTE(review): rename() ignores keys that are not present, so if the CSV
# header is not spelled exactly "Label" this is a silent no-op and these rows
# end up without a category - confirm against the file.
both_news = both_news.rename(columns={"Label": "category"})

# Stack the real and the fake frames, each with a fresh 0..n-1 index.
true = pd.concat([true1, true2], ignore_index=True)
false = pd.concat([false1, false2], ignore_index=True)
# Debug aid: print true.shape / false.shape / both_news.shape here to check
# the frames loaded as expected.
# Handle missing values
def handle_missing_values(dframe):
    """Return a new frame holding only the rows of *dframe* with no missing value.

    The input frame itself is not modified.
    """
    complete_rows = dframe.dropna(axis=0, how="any")
    return complete_rows
# Drop incomplete rows from each of the three frames.
true, false, both_news = (
    handle_missing_values(frame) for frame in (true, false, both_news)
)
# Stack everything into one frame with a fresh 0..n-1 index.
news = pd.concat([true, false, both_news], ignore_index=True)
# Debug aid: news.describe(), news.columns, news.shape and news.info() are
# useful here when checking the merged frame.
# Preprocess the text data
def preprocess_text(txt):
    """Lower-case *txt* and strip URLs, HTML tags, punctuation and digits.

    A missing value (as reported by ``pd.isna``) yields an empty string.
    Newline characters are turned into single spaces; other whitespace is
    left untouched.
    """
    if pd.isna(txt):
        return ""

    # Applied strictly in this order: URLs and tags have to go before the
    # punctuation pass, which would otherwise destroy the "://", "." and
    # "<...>" markers those patterns rely on.
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),  # URLs
        (r'<.*?>', ''),                  # HTML tags
        (r'[^\w\s]', ''),                # punctuation
        (r'\d', ''),                     # digits
        (r'\n', ' '),                    # newlines
    )
    cleaned = txt.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
# Record the index of every row whose text consists of whitespace only.
# The list is kept for inspection; nothing below drops these rows.
blanks = [index for index, text in news["text"].items() if text.isspace()]
# print(len(blanks))

# Instead of dropping those rows, make sure every article carries its title.
# Some source frames were already given a "title + ' ' + text" body earlier in
# this script, so the title is only prepended where the text does not already
# start with it; this avoids counting headline words twice. A space separator
# keeps the last title word from fusing with the first body word.
title_missing = pd.Series(
    [not body.startswith(head) for head, body in zip(news["title"], news["text"])],
    index=news.index,
    dtype=bool,
)
news["text"] = news["text"].where(
    ~title_missing, news["title"] + " " + news["text"]
)

# Only the text and its label are needed from here on.
news = news[["text", "category"]]

# Shuffle the rows. The seed makes the order reproducible, so that the seeded
# train/test split further down yields the same partition on every run.
news = news.sample(frac=1, random_state=42).reset_index(drop=True)

# Lemmatizer instance shared by clean_text.
lemma = WordNetLemmatizer()

# English stopword list from NLTK.
Stopwords = stopwords.words('english')
# print(len(Stopwords))
def clean_text(text):
    """Expand contractions, strip symbols, drop stopwords and lemmatize *text*.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens, lemmatized with the module-level ``lemma`` and
        each followed by a single space (so a non-empty result ends with a
        trailing space; an input with no surviving tokens gives ``""``).
    """
    # Applied in order. "he's" runs before "she's", which still comes out as
    # "she is" because the "he's" rule rewrites its tail.
    contractions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
    )

    text = text.lower()
    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text)

    # Replace special characters with spaces, collapse whitespace, then turn
    # every remaining run of non-alphanumeric characters into one space.
    text = re.sub(r"[-()\"#!@$%^&*{}?.,:]", " ", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub('[^A-Za-z0-9]+', ' ', text)

    # A set gives constant-time stopword checks instead of scanning the list
    # once per token; join builds the result in a single pass.
    stop_words = set(Stopwords)
    return "".join(
        f"{lemma.lemmatize(word)} "
        for word in text.split()
        if word not in stop_words
    )
# Clean every article: preprocess_text first, then clean_text on its output.
# NOTE(review): preprocess_text deletes punctuation, apostrophes included, so
# the contraction rules in clean_text presumably have nothing left to match
# in this order - confirm this is intended.
news['text'] = news['text'].apply(preprocess_text).apply(clean_text)
# Discard rows whose text is empty once surrounding whitespace is ignored.
news = news.loc[news['text'].str.strip().ne("")]
# Feature extraction & model building
# Rows without a label cannot be used for training, so drop them before the
# feature and target series are taken.
# print(news.isna().sum()*100/len(news))
if news['category'].isna().to_numpy().any():
    news = news.dropna(subset=['category'])
x, y = news['text'], news['category']

# Hold out 30% of the rows for testing; the split itself is seeded.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)
# SVM model: a two-step pipeline that turns the text into TF-IDF features and
# feeds them to a linear SVM; fit() returns the fitted pipeline itself.
text_clf = Pipeline(
    steps=[("tfidf", TfidfVectorizer()), ("clf", LinearSVC())]
).fit(x_train, y_train)

# Predict the held-out texts and report how the SVM did.
predictions = text_clf.predict(x_test)
print("SVM Model")
# Confusion matrix
print("Confusion matrix: ")
print(metrics.confusion_matrix(y_test, predictions))
# Overall accuracy, then the per-class precision/recall/F1 report
print("Overall accuracy: ", metrics.accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))
# The remaining models are not wrapped in a pipeline, so build one shared
# TF-IDF representation: fitted on the training texts, reused for the test
# texts (and later by manual_testing).
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

# Classifiers to compare, keyed by the name printed in the report.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
}

# Fit each model, then print its accuracy and per-class report on the test set.
for model_name, model in models.items():
    predictions = model.fit(xv_train, y_train).predict(xv_test)
    print(model_name)
    # Accuracy of the predictions just made (the same value model.score gives).
    print("Accuracy:", metrics.accuracy_score(y_test, predictions))
    print(metrics.classification_report(y_test, predictions))
# Manual testing function
def output_label(label):
    """Translate a predicted class into its display string.

    A label equal to 1 reads "Genuine News"; anything else reads "Fake News".
    """
    if label == 1:
        return "Genuine News"
    return "Fake News"
def manual_testing(text):
    """Classify one article with every model in the module-level ``models``.

    Args:
        text: Raw article text.

    Returns:
        A dict mapping each model name to ``"Genuine News"`` or
        ``"Fake News"``, as produced by ``output_label``.
    """
    # Run both cleaning passes, in the same order the training text went
    # through them. Skipping clean_text here would hand the vectorizer
    # stopwords and unlemmatized forms that the models were never trained on.
    cleaned_text = clean_text(preprocess_text(text))
    # Reuse the TF-IDF vocabulary fitted on the training set.
    new_xv_test = vectorization.transform([cleaned_text])
    return {
        model_name: output_label(model.predict(new_xv_test)[0])
        for model_name, model in models.items()
    }
# Interactive check: read one article from stdin and print each model's verdict.
input_text = input("Enter your Article: ")
news_article = manual_testing(input_text)
for model_name in news_article:
    prediction = news_article[model_name]
    print(f"{model_name} Prediction: {prediction}")