minthigpen.github.io/util.py at master · minthigpen/minthigpen.github.io · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from __future__ import division
from collections import defaultdict
import numpy

def print_most_informative_features(classifier_type, vectorizer, classifier, n=10):
	"""
	Print the most informative features of a classifier, choosing the
	printing routine from classifier_type.

	'nb' is handed to print_most_informative_features_using_prob;
	'log' and 'svm' are handed to
	print_most_informative_features_using_coef.  vectorizer, classifier
	and n are passed through unchanged.  Any other classifier_type
	raises Exception.
	"""
	if classifier_type == 'nb':
		printer = print_most_informative_features_using_prob
	elif classifier_type == 'log' or classifier_type == 'svm':
		printer = print_most_informative_features_using_coef
	else:
		raise Exception('Unrecognized classifier!')

	printer(vectorizer, classifier, n)

def print_most_informative_features_using_prob(vectorizer, classifier, n=10):
	"""
	-- nltk style
	Print the 'most informative' features used by this classifier as
	two side-by-side columns of at most n entries each.

	For the purpose of this function, the informativeness of a
	feature is equal to the highest value of P(feature|label), for
	any label, divided by the lowest value of P(feature|label), for
	any label::

	max[ P(feature|label1) / P(feature|label2) ]

	vectorizer -- object providing get_feature_names_out() (or the
		older get_feature_names()); the returned names are indexed by
		feature position.
	classifier -- object whose feature_log_prob_ holds one row of
		log-probabilities per label.  All rows are used to rank
		features, but only rows 0 and 1 are compared when printing.
	n -- maximum number of features collected for each column.

	The left column holds features whose rounded ratio
	P(f|row 1)/P(f|row 0) is below 1, shown as -P(f|row 0)/P(f|row 1);
	the right column holds the remaining features, shown as
	P(f|row 1)/P(f|row 0) rounded to 4 decimals.  Columns are printed
	in pairs, so output stops at the shorter column.  Nothing is
	returned.
	"""
	# Exponentiate the whole table once: rows are labels, columns are
	# features.
	probs = numpy.exp(numpy.asarray(classifier.feature_log_prob_))
	maxprob = probs.max(axis=0)
	minprob = probs.min(axis=0)

	# Features with a zero probability under some label are dropped.
	# The rest are ordered by min/max ascending, i.e. most informative
	# first.  mergesort is stable, so ties keep ascending feature order.
	candidates = numpy.flatnonzero(minprob)
	order = numpy.argsort(minprob[candidates] / maxprob[candidates], kind='mergesort')
	features = candidates[order]

	# get_feature_names() no longer exists on recent scikit-learn
	# vectorizers; prefer the replacement and fall back for old ones.
	get_names = getattr(vectorizer, 'get_feature_names_out', None)
	if get_names is None:
		get_names = vectorizer.get_feature_names
	feature_names = get_names()

	v0 = []
	v1 = []
	for i in features:
		if (len(v0) >= n) and (len(v1) >= n):
			break
		# Both are non-zero: every candidate has a non-zero minimum.
		p0 = float(probs[0][i])
		p1 = float(probs[1][i])
		ratio = round(p1 / p0, 4)
		if ratio < 1:
			if len(v0) < n:
				# Invert the exact probabilities, not the rounded ratio:
				# the rounded ratio can be 0.0 and always loses precision.
				v0.append((-(p0 / p1), feature_names[i]))
		elif len(v1) < n:
			v1.append((ratio, feature_names[i]))

	for (c1, f1), (c2, f2) in zip(v0, v1):
		print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2))

def print_most_informative_features_using_coef(vectorizer, classifier, n=10):
	"""
	Print the n lowest and n highest entries of classifier.coef_[0]
	side by side, each followed by its feature name.

	vectorizer -- object providing get_feature_names_out() (or the
		older get_feature_names()); names are paired positionally with
		the coefficients.
	classifier -- object whose coef_[0] is the sequence of per-feature
		coefficients.
	n -- number of rows to print.

	The left column runs from the lowest coefficient upwards and the
	right column from the highest coefficient downwards.  Nothing is
	returned.
	"""
	# get_feature_names() no longer exists on recent scikit-learn
	# vectorizers; prefer the replacement and fall back for old ones.
	get_names = getattr(vectorizer, 'get_feature_names_out', None)
	if get_names is None:
		get_names = vectorizer.get_feature_names

	c_f = sorted(zip(classifier.coef_[0], get_names()))
	# c_f[:n] walks up from the lowest coefficient; the reversed slice
	# walks down from the highest.
	top = zip(c_f[:n], c_f[:-(n+1):-1])
	for (c1,f1),(c2,f2) in top:
		print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1,f1,c2,f2))