Using some guidance from the scikit-learn tutorial on understanding decision tree structures, I had the idea that perhaps looking at combinations of features occurring between two connected nodes might give some insight into potential "interaction" terms. That is, by looking at how often a given feature y follows a given feature x in the tree's splits, we might be able to determine whether there's some higher-order interaction between x and y, relative to the other variables in the model.
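To make the idea concrete before showing the setup, here's a quick toy sketch of my own (the data and names are made up purely for illustration): a regression target built from a known interaction between two features. When a tree is fit to it, splits on one of the interacting features tend to be followed by splits on the other, while the irrelevant third feature should rarely be chosen - which is exactly the pattern I want to count.

import numpy as np
from sklearn.tree import DecisionTreeRegressor, export_text

rng = np.random.RandomState(0)
X_toy = rng.uniform(0, 1, size=(500, 3))
y_toy = X_toy[:, 0] * X_toy[:, 1]  # pure x1*x2 interaction; x3 is irrelevant
toy_tree = DecisionTreeRegressor(max_depth=3, random_state=0).fit(X_toy, y_toy)
print(export_text(toy_tree, feature_names=['x1', 'x2', 'x3']))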
Here is my setup. Basically this object just parses the structure of the tree and makes it easy for us to traverse nodes and determine what's happening at each node.
import numpy as np


class TreeInteractionFinder(object):

    def __init__(self, model, feature_names=None):
        self.model = model
        self.feature_names = feature_names

        self._parse_tree_structure()
        self._node_and_leaf_compute()

    def _parse_tree_structure(self):
        self.n_nodes = self.model.tree_.node_count
        self.children_left = self.model.tree_.children_left
        self.children_right = self.model.tree_.children_right
        self.feature = self.model.tree_.feature
        self.threshold = self.model.tree_.threshold
        self.n_node_samples = self.model.tree_.n_node_samples
        self.predicted_values = self.model.tree_.value

    def _node_and_leaf_compute(self):
        ''' Compute node depth and whether each node is a leaf '''
        node_depth = np.zeros(shape=self.n_nodes, dtype=np.int64)
        is_leaves = np.zeros(shape=self.n_nodes, dtype=bool)

        # Seed is the root node id and its parent depth
        stack = [(0, -1)]
        while stack:
            node_idx, parent_depth = stack.pop()
            node_depth[node_idx] = parent_depth + 1

            # If we have a test (where "test" means decision-test) node
            if self.children_left[node_idx] != self.children_right[node_idx]:
                stack.append((self.children_left[node_idx], parent_depth + 1))
                stack.append((self.children_right[node_idx], parent_depth + 1))
            else:
                is_leaves[node_idx] = True

        self.is_leaves = is_leaves
        self.node_depth = node_depth
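As a quick sanity check of the parsing (again a small self-contained example of my own, not part of the main setup), the arrays can be printed for a tiny fitted tree:

from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor

X_demo, y_demo = make_regression(n_samples=200, n_features=4, random_state=0)
demo_tree = DecisionTreeRegressor(max_depth=3, random_state=0).fit(X_demo, y_demo)
demo_finder = TreeInteractionFinder(demo_tree, ['f0', 'f1', 'f2', 'f3'])

for i in range(demo_finder.n_nodes):
    if demo_finder.is_leaves[i]:
        print('node {}: leaf at depth {}'.format(i, demo_finder.node_depth[i]))
    else:
        name = demo_finder.feature_names[demo_finder.feature[i]]
        print('node {}: splits on {} at depth {} (children {}, {})'.format(
            i, name, demo_finder.node_depth[i],
            demo_finder.children_left[i], demo_finder.children_right[i]))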
Next, I'll train a somewhat deep tree on a dataset. The Boston housing dataset gave me some interesting results, so that's what I've used in this example:
from sklearn.datasets import load_boston as load_dataset
from sklearn.tree import DecisionTreeRegressor

bunch = load_dataset()
X, y = bunch.data, bunch.target
feature_names = bunch.feature_names

model = DecisionTreeRegressor(max_depth=20, min_samples_leaf=2)
model.fit(X, y)

finder = TreeInteractionFinder(model, feature_names)
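(If load_boston isn't available in your scikit-learn version, any other regression dataset with named features can be swapped in the same way; for example, a sketch using the California housing data, with everything downstream unchanged:)

from sklearn.datasets import fetch_california_housing

bunch = fetch_california_housing()
X, y = bunch.data, bunch.target
feature_names = bunch.feature_names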
from collections import defaultdict

feature_combos = defaultdict(int)

# Traverse the tree fully, counting the occurrences of features at the current and next indices
for idx in range(finder.n_nodes):
    curr_node_is_leaf = finder.is_leaves[idx]
    curr_feature = finder.feature_names[finder.feature[idx]]

    if not curr_node_is_leaf:
        # Test to see if we're at the end of the tree
        try:
            next_idx = finder.feature[idx + 1]
        except IndexError:
            break
        else:
            next_node_is_leaf = finder.is_leaves[next_idx]
            if not next_node_is_leaf:
                next_feature = finder.feature_names[next_idx]
                feature_combos[frozenset({curr_feature, next_feature})] += 1

from pprint import pprint
pprint(sorted(feature_combos.items(), key=lambda x: -x[1]))
pprint(sorted(zip(feature_names, model.feature_importances_), key=lambda x: -x[1]))
Which yields:
$ python3 *py
[(frozenset({'AGE', 'LSTAT'}), 4),
(frozenset({'RM', 'LSTAT'}), 3),
(frozenset({'AGE', 'NOX'}), 3),
(frozenset({'NOX', 'CRIM'}), 3),
(frozenset({'NOX', 'DIS'}), 3),
(frozenset({'LSTAT', 'DIS'}), 2),
(frozenset({'AGE', 'RM'}), 2),
(frozenset({'AGE', 'DIS'}), 2),
(frozenset({'TAX', 'DIS'}), 1),
(frozenset({'RM', 'INDUS'}), 1),
(frozenset({'PTRATIO'}), 1),
(frozenset({'NOX', 'PTRATIO'}), 1),
(frozenset({'LSTAT', 'CRIM'}), 1),
(frozenset({'RM'}), 1),
(frozenset({'TAX', 'PTRATIO'}), 1),
(frozenset({'NOX'}), 1),
(frozenset({'DIS', 'CRIM'}), 1),
(frozenset({'AGE', 'PTRATIO'}), 1),
(frozenset({'AGE', 'CRIM'}), 1),
(frozenset({'ZN', 'DIS'}), 1),
(frozenset({'ZN', 'CRIM'}), 1),
(frozenset({'CRIM', 'PTRATIO'}), 1),
(frozenset({'RM', 'CRIM'}), 1)]
[('RM', 0.60067090411997),
('LSTAT', 0.22148824141475706),
('DIS', 0.068263421165279),
('CRIM', 0.03893906506019243),
('NOX', 0.028695328014265362),
('PTRATIO', 0.014211478583574726),
('AGE', 0.012467751974477529),
('TAX', 0.011821058983765207),
('B', 0.002420619208623876),
('INDUS', 0.0008323703650693053),
('ZN', 0.00018976111002551332),
('CHAS', 0.0),
('RAD', 0.0)]
After adding the criterion that excludes "next" nodes that are leaves (already applied in the loop above), the results seem to improve.
Now, one combination of features that immediately jumps out as occurring very frequently is frozenset({'CRIM', 'B'}) - that is, the combination of the crime rate and the proportion of Black residents in a neighborhood. If you look more closely at the feature combos, the B variable occurs in just about every potentially meaningful combination, yet according to the latter output B has very little importance. Granted, that's not an apples-to-apples comparison - perhaps an explicit interaction term between B and those other features would be needed to mirror the combos' "importance."
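For what it's worth, a variant of the counting loop that pairs each decision node with both of its actual children (via children_left/children_right, rather than only the next array index) might give a more complete picture; here's a sketch, reusing the finder object from above:

from collections import defaultdict
from pprint import pprint

child_combos = defaultdict(int)
for idx in range(finder.n_nodes):
    if finder.is_leaves[idx]:
        continue
    parent_feature = finder.feature_names[finder.feature[idx]]
    # Pair the decision node's feature with the feature at each non-leaf child
    for child_idx in (finder.children_left[idx], finder.children_right[idx]):
        if not finder.is_leaves[child_idx]:
            child_feature = finder.feature_names[finder.feature[child_idx]]
            child_combos[frozenset({parent_feature, child_feature})] += 1

pprint(sorted(child_combos.items(), key=lambda x: -x[1]))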
Is this even barking up the right tree (pun maybe intended)? Does counting combinations of sequential features in a given tree speak to potential interactions in a model?
from Current node to next node feature combinations in machine learning: useful to determine potential interactions?