# classifier.py
import pandas as pd
import os
import numpy as np
np.set_printoptions(precision=4)
import catboost
from catboost import *
from catboost import datasets
# import sklearn train test split
from sklearn.model_selection import train_test_split
# Load the raw dataset and hold out 10% of the rows as a test split; the fixed
# random_state makes the split reproducible between runs.
data = pd.read_csv("old_data/data.csv")
train_df, test_df = train_test_split(data, random_state=476, test_size=0.1)

# Target column and feature matrix for the training split.
# NOTE(review): only 'label' is dropped here; if data.csv also carries an
# identifier column such as entry_id, it stays in X_train and is treated as a
# categorical feature below -- confirm this is intended.
y_train = train_df['label']
X_train = train_df.drop(columns=['label'])

# Every feature column is declared categorical, so list all positional indices.
cat_features = list(range(X_train.shape[1]))
# Directory that receives the exported train/test files.
dataset_dir = 'ProjectK'
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# without the race of a separate "exists?" check followed by a create.
os.makedirs(dataset_dir, exist_ok=True)

# Export each split twice so it can be read with or without a header and with
# different separators:
#   <split>.tsv -- tab-separated, no header row
#   <split>.csv -- comma-separated, with header row
for frame, stem in ((train_df, 'train'), (test_df, 'test')):
    frame.to_csv(
        os.path.join(dataset_dir, stem + '.tsv'),
        index=False, sep='\t', header=False
    )
    frame.to_csv(
        os.path.join(dataset_dir, stem + '.csv'),
        index=False, sep=',', header=True
    )
from catboost.utils import create_cd

# Write a CatBoost column-description file (train.cd) for the exported
# training data: one column is the label, every other column is categorical.
#
# The label position is looked up by name rather than assumed to be column 0,
# so the description stays correct wherever 'label' sits in the frame. When
# 'label' is the first column the output is identical to hard-coding 0.
label_column = int(train_df.columns.get_loc('label'))
column_count = train_df.columns.shape[0]

# feature_names is keyed by feature index, i.e. the position of a column once
# the label column has been skipped.
feature_names = {
    feature_index: name
    for feature_index, name in enumerate(
        column for column in train_df.columns if column != 'label'
    )
}

create_cd(
    label=label_column,
    # cat_features uses raw column indices: every column except the label.
    cat_features=[index for index in range(column_count) if index != label_column],
    feature_names=feature_names,
    output_path=os.path.join(dataset_dir, 'train.cd')
)
# Cast every column of both splits to str (this includes 'label').
# NOTE(review): presumably done so that all feature columns can be passed to
# CatBoost as categorical features (cat_features lists every one) -- confirm.
test_df = test_df.astype(str)
train_df = train_df.astype(str)

# Separate the target from the features for each split, now using the
# stringified frames; this rebinds the earlier X_train / y_train.
X_train = train_df.drop(columns=['label'])
y_train = train_df['label']
X_test = test_df.drop(columns=['label'])
y_test = test_df['label']
from catboost import CatBoostClassifier

# Multiclass classifier: 256 boosting iterations, with a fixed seed.
model = CatBoostClassifier(
    loss_function='MultiClass',
    random_seed=511,
    iterations=256
)

# Fit on the training split with every feature column marked categorical.
# No eval_set is passed, so the held-out X_test / y_test play no part in
# training. verbose=False turns off verbose output and plot=True requests
# CatBoost's training plot.
# NOTE(review): plot=True presumably needs a notebook environment -- confirm.
model.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    plot=True,
    verbose=False
)