I am trying to train a DecisionTreeClassifier:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
dataset = pd.read_csv('./vgsales.csv')
X = dataset[["Name"]]
Y = dataset[["Global_Sales"]]
model = DecisionTreeClassifier()
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2)
model.fit(X_train,Y_train)
When running model.fit
line, I am getting the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-58-84aa7640ed28> in <module>
12 X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2)
13
---> 14 model.fit(X_train,Y_train)
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/tree/_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
896 """
897
--> 898 super().fit(
899 X, y,
900 sample_weight=sample_weight,
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/tree/_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
154 check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
155 check_y_params = dict(ensure_2d=False, dtype=None)
--> 156 X, y = self._validate_data(X, y,
157 validate_separately=(check_X_params,
158 check_y_params))
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
428 # :(
429 check_X_params, check_y_params = validate_separately
--> 430 X = check_array(X, **check_X_params)
431 y = check_array(y, **check_y_params)
432 else:
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
614 array = array.astype(dtype, casting="unsafe", copy=False)
615 else:
--> 616 array = np.asarray(array, order=order, dtype=dtype)
617 except ComplexWarning as complex_warning:
618 raise ValueError("Complex data not supported\n"
~/opt/anaconda3/lib/python3.8/site-packages/numpy/core/_asarray.py in asarray(a, dtype, order, like)
100 return _asarray_with_like(a, dtype=dtype, order=order, like=like)
101
--> 102 return array(a, dtype, copy=False, order=order)
103
104
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in __array__(self, dtype)
1897
1898 def __array__(self, dtype=None) -> np.ndarray:
-> 1899 return np.asarray(self._values, dtype=dtype)
1900
1901 def __array_wrap__(
~/opt/anaconda3/lib/python3.8/site-packages/numpy/core/_asarray.py in asarray(a, dtype, order, like)
100 return _asarray_with_like(a, dtype=dtype, order=order, like=like)
101
--> 102 return array(a, dtype, copy=False, order=order)
103
104
ValueError: could not convert string to float: 'Tiger Woods PGA Tour 14'
I have already tried to use the code mentioned in this link -> sklearn-LinearRegression: could not convert string to float: '--'
When I use apply() method to convert to numeric/float, all my data values are changing to NaN.
The dataset I am using is https://www.kaggle.com/gregorut/videogamesales