pythonのscikit-learnでgrid search(テキスト分類)
scikit-learnの復習のため、グリッドサーチしてみた。
テキスト分類です。
タスク
テキストのマルチクラス(20クラス)問題
方法:
- TFIDFのBoWでベクトル作成
- 線形分類問題として20クラス分類(one vs the rest)
グリッドサーチのパタン
- TF-IDF:1-gram or 1-gram+2-gram、単語頻度 or 単語出現(Binary)、idfを使う or 使わない、正規化しない or L2
- 線形分類:損失がhinge or 2乗誤差、正則化 L1 or L2
結果
経験則から以下のパラメタがよさそうと思っていた。
- TF-IDF:1-gramでBinaryでidfなしで正規化をL1
- 線形分類:2乗誤差で正則化L1
でも、、、
- TF-IDF:1-gram+2-gram、頻度、idfあり、正規化をL2
- 線形分類:2乗誤差、正則化L2
他の結果とも比較すると、線形分類のパラメタは感覚とあう。
ただ、単語頻度+IDFが全体的には良い結果になっている。ここはちょっと感覚と違う。
計算時間
64パタンを10分割したので640モデルを作成したことになる。
うちのマシンでは10時間くらいかかった、、、。
感想
パラメタによって結果がかなり変わっているのが分かる。一番良かったものと悪かったものの差は20ポイントもある。
ちゃんと測ってあげないとだめだね。
コード
#!/usr/bin/python # encoding: utf-8 ''' -- GridSearh for scikit learn - LinearSVC with TextData @author: mzi @copyright: 2017 mzi. All rights reserved. @license: Apache Licence 2.0 ''' from __future__ import print_function import sys import os import numpy as np from optparse import OptionParser from time import time from sklearn.datasets import fetch_20newsgroups from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer from sklearn.svm import LinearSVC from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn.utils.extmath import density from sklearn import metrics __all__ = [] __version__ = 0.1 __date__ = '2017-02-11' __updated__ = '2017-02-11' TESTRUN = 0 PROFILE = 0 def size_mb(docs): return sum(len(s.encode('utf-8')) for s in docs) / 1e6 def trim(s): """Trim string to fit on terminal (assuming 80-column display)""" return s if len(s) <= 80 else s[:77] + "..." def benchmark(clf, train_data, train_target, test_data, test_target, target_names, opts): print('_' * 80) print("Training: ") print(clf) t0 = time() clf.fit(train_data, train_target) train_time = time() - t0 print("train time: %0.3fs" % train_time) t0 = time() pred = clf.predict(test_data) test_time = time() - t0 print("test time: %0.3fs" % test_time) score = metrics.accuracy_score(test_target, pred) print("accuracy: %0.3f" % score) feature_names = None if isinstance(clf, GridSearchCV): clf = clf.best_estimator_ if isinstance(clf, Pipeline): vect = clf.steps[0][1] if isinstance(vect, TfidfVectorizer) or isinstance(vect, CountVectorizer): feature_names = np.asarray(vect.get_feature_names()) clf = clf.steps[-1][1] if hasattr(clf, 'coef_'): print("dimensionality: %d" % clf.coef_.shape[1]) print("density: %f" % density(clf.coef_)) if opts.print_top10 and feature_names is not None: print("top 10 keywords per class:") for i, label in enumerate(target_names): top10 = np.argsort(clf.coef_[i])[-10:] print(trim("%s: %s" % (label, " 
".join(feature_names[top10])))) print() if opts.print_report: print("classification report:") print(metrics.classification_report(test_target, pred, target_names=target_names)) if opts.print_cm: print("confusion matrix:") print(metrics.confusion_matrix(test_target, pred)) print() clf_descr = str(clf).split('(')[0] return clf_descr, score, train_time, test_time def main(argv=None): '''Command line options.''' program_name = os.path.basename(sys.argv[0]) program_version = "v%f" % __version__ program_build_date = "%s" % __updated__ program_version_string = '%%prog %s (%s)' % (program_version, program_build_date) program_longdesc = 'GridSearh for scikit learn - LinearSVC with TextData' program_license = "Copyright 2017 mzi \ Licensed under the Apache License 2.0\nhttp://www.apache.org/licenses/LICENSE-2.0" if argv is None: argv = sys.argv[1:] # setup option parser op = OptionParser(version=program_version_string, epilog=program_longdesc, description=program_license) op.add_option("--report", action="store_true", dest="print_report", help="Print a detailed classification report.") op.add_option("--confusion_matrix", action="store_true", dest="print_cm", help="Print the confusion matrix.") op.add_option("--top10", action="store_true", dest="print_top10", help="Print ten most discriminative terms per class" " for every classifier.") op.add_option("--all_categories", action="store_true", dest="all_categories", help="Whether to use all categories or not.") op.add_option("--filtered", action="store_true", help="Remove newsgroup information that is easily overfit: " "headers, signatures, and quoting.") # process options (opts, args) = op.parse_args(argv) print(__doc__) op.print_help() print() #Categories if opts.all_categories: categories = None else: categories = [ 'alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', ] # MAIN BODY # #Remove headers if opts.filtered: remove = ('headers', 'footers', 'quotes') else: remove = () print("Loading 20 newsgroups dataset 
for categories:") print(categories if categories else "all") data_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42, remove=remove) data_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42, remove=remove) print('data loaded') # order of labels in `target_names` can be different from `categories` target_names = data_train.target_names data_train_size_mb = size_mb(data_train.data) data_test_size_mb = size_mb(data_test.data) print("%d documents - %0.3fMB (training set)" % ( len(data_train.data), data_train_size_mb)) print("%d documents - %0.3fMB (test set)" % ( len(data_test.data), data_test_size_mb)) print() # Linear Classification GridSearch # - Term Vector(n-gram, tfidf or binary, normarize instanse wize(l1) or none # - Linear LinerSVC(loss=(l1 or l2) results = [] print('=' * 80) print("GridSearch LinearSVC with L2") print("TermVector(n-gram=(1 to 2), binary=(true or not), idf=(true or not), normlized=(true or not))") print("LinerSVC(loss=(l1 or l2))") text_clf = Pipeline([ ('vect', TfidfVectorizer(stop_words='english')), ('clf', LinearSVC()) ]) parameters =[ {'vect__ngram_range': [(1, 1), (1, 2)], 'vect__binary': [True, False], 'vect__use_idf': [True, False], 'vect__norm': [None, 'l2'], 'clf__loss': ['squared_hinge'], 'clf__penalty': ['l1'], 'clf__dual': [False] }, {'vect__ngram_range': [(1, 1), (1, 2)], 'vect__binary': [True, False], 'vect__use_idf': [True, False], 'vect__norm': [None, 'l2'], 'clf__loss': ['hinge'], 'clf__penalty': ['l2'], 'clf__dual': [True]}, {'vect__ngram_range': [(1, 1), (1, 2)], 'vect__binary': [True, False], 'vect__use_idf': [True, False], 'vect__norm': [None, 'l2'], 'clf__loss': ['squared_hinge'], 'clf__penalty': ['l2'], 'clf__dual': [True, False]} ] gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1,cv=10,verbose=1) results.append(benchmark(gs_clf, data_train.data, data_train.target, data_test.data, data_test.target, target_names, opts)) print("Best 
score: %0.3f" % gs_clf.best_score_) print("Best parameters set:") best_parameters = gs_clf.best_estimator_.get_params() for param_name in ['vect__ngram_range', 'vect__binary', 'vect__use_idf', 'vect__norm', 'clf__loss', 'clf__penalty', 'clf__dual']: print("\t%s: %r" % (param_name, best_parameters[param_name])) print("Av. score for each parameters") means = gs_clf.cv_results_['mean_test_score'] stds = gs_clf.cv_results_['std_test_score'] for mean, std, params in zip(means, stds, gs_clf.cv_results_['params']): print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)) if __name__ == "__main__": if TESTRUN: import doctest doctest.testmod() if PROFILE: import cProfile import pstats profile_filename = '_profile.txt' cProfile.run('main()', profile_filename) statsfile = open("profile_stats.txt", "wb") p = pstats.Stats(profile_filename, stream=statsfile) stats = p.strip_dirs().sort_stats('cumulative') stats.print_stats() statsfile.close() sys.exit(0) sys.exit(main())
コマンドライン
python3 scikit_gridsearch.py --top10 --report --confusion_matrix --all_categories --filtered
結果出力
-- GridSearh for scikit learn - LinearSVC with TextData @author: mzi @copyright: 2017 mzi. All rights reserved. @license: Apache Licence 2.0 Usage: scikit_gridsearch.py [options] Copyright 2017 mzi Licensed under the Apache License 2.0 http://www.apache.org/licenses/LICENSE-2.0 Options: --version show program's version number and exit -h, --help show this help message and exit --report Print a detailed classification report. --confusion_matrix Print the confusion matrix. --top10 Print ten most discriminative terms per class for every classifier. --all_categories Whether to use all categories or not. --filtered Remove newsgroup information that is easily overfit: headers, signatures, and quoting. GridSearh for scikit learn - LinearSVC with TextData Loading 20 newsgroups dataset for categories: all data loaded 11314 documents - 13.782MB (training set) 7532 documents - 8.262MB (test set) ================================================================================ GridSearch LinearSVC with L2 TermVector(n-gram=(1 to 2), binary=(true or not), idf=(true or not), normlized=(true or not)) LinerSVC(loss=(l1 or l2)) ________________________________________________________________________________ Training: GridSearchCV(cv=10, error_score='raise', estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict', dtype=<class 'numpy.int64'>, encoding='utf-8', input='content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True, ...ax_iter=1000, multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, verbose=0))]), fit_params={}, iid=True, n_jobs=-1, param_grid=[{'vect__norm': [None, 'l2'], 'vect__binary': [True, False], 'clf__penalty': ['l1'], 'vect__use_idf': [True, False], 'clf__loss': ['squared_hinge'], 'clf__dual': [False], 'vect__ngram_range': [(1, 1), (1, 2)]}, {'vect__norm': [None, 'l2'], 'vect__binary': [True, False], 'clf__penalty': ['... 
'clf__loss': ['squared_hinge'], 'clf__dual': [True, False], 'vect__ngram_range': [(1, 1), (1, 2)]}], pre_dispatch='2*n_jobs', refit=True, return_train_score=True, scoring=None, verbose=1) Fitting 10 folds for each of 64 candidates, totalling 640 fits [Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 2.6min [Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 18.7min [Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 56.7min [Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed: 566.4min finished train time: 34000.658s test time: 2.164s accuracy: 0.704 dimensionality: 943737 density: 0.546218 top 10 keywords per class: alt.atheism: punishment atheist deletion islamic motto bobby atheists religio... comp.graphics: format cview polygon animation tiff images pov 3d image graphics comp.os.ms-windows.misc: ax fonts w4wg driver risc win3 ini file cica windows comp.sys.ibm.pc.hardware: pc bus irq scsi 486 vlb bios gateway ide controller comp.sys.mac.hardware: lciii c650 duo lc powerbook se centris quadra apple mac comp.windows.x: application xlib widgets mit x11r5 xterm widget window server... misc.forsale: interested 00 email make offer asking condition sell offer ship... rec.autos: wagon gt vw toyota oil engine dealer ford cars car rec.motorcycles: harley dog bmw motorcycle riding helmet ride bikes dod bike rec.sport.baseball: hit year runs alomar pitching cubs braves phillies stadiu... rec.sport.hockey: espn playoff puck season leafs playoffs game team nhl hockey sci.crypt: pgp des privacy crypto security keys key nsa clipper encryption sci.electronics: output uv dial circuits power ground 8051 voltage electronic... sci.med: medicine diet cancer patients pain treatment medical disease doctor msg sci.space: solar earth lunar shuttle spacecraft moon nasa launch orbit space soc.religion.christian: marriage faith resurrection easter scripture christ c... 
talk.politics.guns: batf bd firearm nra weapon fbi firearms weapons guns gun talk.politics.mideast: turkey loser arabs armenian turkish jews armenians ara... talk.politics.misc: government _too_ president taxes jobs libertarians trial ... talk.religion.misc: sure wrong rosicrucian cockroaches lunacy critus hudson o... classification report: precision recall f1-score support alt.atheism 0.53 0.47 0.50 319 comp.graphics 0.67 0.72 0.69 389 comp.os.ms-windows.misc 0.65 0.64 0.64 394 comp.sys.ibm.pc.hardware 0.67 0.67 0.67 392 comp.sys.mac.hardware 0.73 0.70 0.72 385 comp.windows.x 0.83 0.72 0.77 395 misc.forsale 0.72 0.80 0.76 390 rec.autos 0.79 0.71 0.75 396 rec.motorcycles 0.83 0.76 0.80 398 rec.sport.baseball 0.54 0.85 0.66 397 rec.sport.hockey 0.88 0.90 0.89 399 sci.crypt 0.84 0.71 0.77 396 sci.electronics 0.65 0.60 0.62 393 sci.med 0.78 0.79 0.79 396 sci.space 0.75 0.76 0.76 394 soc.religion.christian 0.65 0.82 0.72 398 talk.politics.guns 0.60 0.67 0.63 364 talk.politics.mideast 0.85 0.76 0.80 376 talk.politics.misc 0.60 0.48 0.53 310 talk.religion.misc 0.49 0.27 0.35 251 avg / total 0.71 0.70 0.70 7532 confusion matrix: [[149 1 3 1 2 2 5 4 1 12 4 2 5 6 12 60 7 12 7 24] [ 5 280 21 7 7 17 10 3 1 12 0 6 6 2 7 2 1 1 1 0] [ 2 18 251 39 16 11 5 2 2 16 1 3 0 7 11 1 0 2 4 3] [ 0 14 32 262 27 6 14 1 0 10 2 2 20 0 1 0 0 0 1 0] [ 2 5 12 27 270 3 16 3 1 15 1 2 17 4 4 0 1 0 0 2] [ 0 42 35 4 5 284 2 0 1 9 0 3 4 0 4 0 1 0 0 1] [ 0 3 2 17 16 0 313 5 2 10 1 1 8 1 3 2 1 2 2 1] [ 3 0 1 1 3 1 16 282 15 29 4 1 20 4 5 1 2 4 4 0] [ 4 3 1 2 2 0 6 18 304 20 0 0 10 5 7 3 3 0 9 1] [ 2 3 0 0 0 2 6 2 4 339 21 0 1 5 1 3 2 1 4 1] [ 1 1 0 0 1 1 0 1 2 21 359 0 1 3 1 0 5 0 1 1] [ 2 8 6 3 8 3 8 2 5 20 1 283 9 2 3 5 12 2 12 2] [ 3 10 12 25 7 7 19 8 7 17 3 12 237 10 8 2 2 1 2 1] [ 6 7 1 1 1 0 6 6 3 16 5 0 8 314 2 5 4 2 6 3] [ 5 11 3 0 2 0 3 8 3 20 2 1 10 11 301 3 4 1 5 1] [ 17 3 3 0 0 1 0 0 2 14 0 0 1 6 4 327 2 3 4 11] [ 7 3 2 1 1 2 3 5 6 14 0 13 2 5 9 7 244 9 19 12] [ 24 2 2 0 0 1 1 2 4 11 0 3 2 2 
2 13 7 286 12 2] [ 14 0 0 0 0 1 1 5 2 12 3 2 3 6 11 3 87 6 148 6] [ 33 5 2 2 1 1 2 2 1 9 1 1 2 8 6 69 24 6 7 69]] Best score: 0.780 Best parameters set: vect__ngram_range: (1, 2) vect__binary: False vect__use_idf: True vect__norm: 'l2' clf__loss: 'squared_hinge' clf__penalty: 'l2' clf__dual: True Av. score for each parameters 0.682 (+/-0.024) for {'vect__ngram_range': (1, 1), 'vect__norm': None, 'vect__binary': True, 'clf__penalty': 'l1', 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__use_idf': True} 0.691 (+/-0.029) for {'vect__ngram_range': (1, 1), 'vect__norm': None, 'vect__binary': True, 'clf__penalty': 'l1', 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__use_idf': False} 0.723 (+/-0.021) for {'vect__ngram_range': (1, 1), 'vect__norm': 'l2', 'vect__binary': True, 'clf__penalty': 'l1', 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__use_idf': True} 0.708 (+/-0.023) for {'vect__ngram_range': (1, 1), 'vect__norm': 'l2', 'vect__binary': True, 'clf__penalty': 'l1', 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__use_idf': False} 0.702 (+/-0.026) for {'vect__ngram_range': (1, 2), 'vect__binary': True, 'clf__penalty': 'l1', 'vect__use_idf': True, 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__norm': None} 0.699 (+/-0.020) for {'vect__norm': None, 'vect__binary': True, 'clf__penalty': 'l1', 'vect__use_idf': False, 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__ngram_range': (1, 2)} 0.703 (+/-0.024) for {'vect__ngram_range': (1, 2), 'vect__norm': 'l2', 'vect__binary': True, 'clf__penalty': 'l1', 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__use_idf': True} 0.700 (+/-0.026) for {'vect__ngram_range': (1, 2), 'vect__norm': 'l2', 'vect__binary': True, 'clf__penalty': 'l1', 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__use_idf': False} 0.673 (+/-0.027) for {'vect__ngram_range': (1, 1), 'vect__norm': None, 'vect__binary': False, 'clf__penalty': 'l1', 'clf__loss': 'squared_hinge', 'clf__dual': 
False, 'vect__use_idf': True} 0.693 (+/-0.022) for {'vect__norm': None, 'vect__binary': False, 'clf__penalty': 'l1', 'vect__use_idf': False, 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__ngram_range': (1, 1)} 0.730 (+/-0.025) for {'vect__ngram_range': (1, 1), 'vect__norm': 'l2', 'vect__binary': False, 'clf__penalty': 'l1', 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__use_idf': True} 0.717 (+/-0.026) for {'vect__norm': 'l2', 'vect__binary': False, 'clf__penalty': 'l1', 'vect__use_idf': False, 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__ngram_range': (1, 1)} 0.698 (+/-0.023) for {'vect__ngram_range': (1, 2), 'vect__binary': False, 'clf__penalty': 'l1', 'vect__use_idf': True, 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__norm': None} 0.695 (+/-0.031) for {'vect__ngram_range': (1, 2), 'vect__norm': None, 'vect__binary': False, 'clf__penalty': 'l1', 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__use_idf': False} 0.716 (+/-0.022) for {'vect__ngram_range': (1, 2), 'vect__norm': 'l2', 'vect__binary': False, 'clf__penalty': 'l1', 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__use_idf': True} 0.710 (+/-0.025) for {'vect__norm': 'l2', 'vect__binary': False, 'clf__penalty': 'l1', 'vect__use_idf': False, 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__ngram_range': (1, 2)} 0.610 (+/-0.022) for {'vect__ngram_range': (1, 1), 'vect__binary': True, 'clf__penalty': 'l2', 'vect__use_idf': True, 'clf__loss': 'hinge', 'clf__dual': True, 'vect__norm': None} 0.664 (+/-0.024) for {'vect__norm': None, 'vect__binary': True, 'clf__penalty': 'l2', 'vect__use_idf': False, 'clf__loss': 'hinge', 'clf__dual': True, 'vect__ngram_range': (1, 1)} 0.768 (+/-0.022) for {'vect__norm': 'l2', 'vect__binary': True, 'clf__penalty': 'l2', 'vect__use_idf': True, 'clf__loss': 'hinge', 'clf__dual': True, 'vect__ngram_range': (1, 1)} 0.739 (+/-0.025) for {'vect__norm': 'l2', 'vect__binary': True, 'clf__penalty': 'l2', 'vect__use_idf': 
False, 'clf__loss': 'hinge', 'clf__dual': True, 'vect__ngram_range': (1, 1)} 0.585 (+/-0.033) for {'vect__ngram_range': (1, 2), 'vect__binary': True, 'clf__penalty': 'l2', 'vect__use_idf': True, 'clf__loss': 'hinge', 'clf__dual': True, 'vect__norm': None} 0.680 (+/-0.032) for {'vect__norm': None, 'vect__binary': True, 'clf__penalty': 'l2', 'vect__use_idf': False, 'clf__loss': 'hinge', 'clf__dual': True, 'vect__ngram_range': (1, 2)} 0.773 (+/-0.021) for {'vect__norm': 'l2', 'vect__binary': True, 'clf__penalty': 'l2', 'vect__use_idf': True, 'clf__loss': 'hinge', 'clf__dual': True, 'vect__ngram_range': (1, 2)} 0.748 (+/-0.026) for {'vect__ngram_range': (1, 2), 'vect__norm': 'l2', 'vect__binary': True, 'clf__penalty': 'l2', 'clf__loss': 'hinge', 'clf__dual': True, 'vect__use_idf': False} 0.599 (+/-0.028) for {'vect__norm': None, 'vect__binary': False, 'clf__penalty': 'l2', 'vect__use_idf': True, 'clf__loss': 'hinge', 'clf__dual': True, 'vect__ngram_range': (1, 1)} 0.655 (+/-0.025) for {'vect__norm': None, 'vect__binary': False, 'clf__penalty': 'l2', 'vect__use_idf': False, 'clf__loss': 'hinge', 'clf__dual': True, 'vect__ngram_range': (1, 1)} 0.770 (+/-0.020) for {'vect__ngram_range': (1, 1), 'vect__norm': 'l2', 'vect__binary': False, 'clf__penalty': 'l2', 'clf__loss': 'hinge', 'clf__dual': True, 'vect__use_idf': True} 0.743 (+/-0.025) for {'vect__ngram_range': (1, 1), 'vect__binary': False, 'clf__penalty': 'l2', 'vect__use_idf': False, 'clf__loss': 'hinge', 'clf__dual': True, 'vect__norm': 'l2'} 0.600 (+/-0.026) for {'vect__ngram_range': (1, 2), 'vect__norm': None, 'vect__binary': False, 'clf__penalty': 'l2', 'clf__loss': 'hinge', 'clf__dual': True, 'vect__use_idf': True} 0.679 (+/-0.027) for {'vect__norm': None, 'vect__binary': False, 'clf__penalty': 'l2', 'vect__use_idf': False, 'clf__loss': 'hinge', 'clf__dual': True, 'vect__ngram_range': (1, 2)} 0.777 (+/-0.024) for {'vect__ngram_range': (1, 2), 'vect__norm': 'l2', 'vect__binary': False, 'clf__penalty': 'l2', 
'clf__loss': 'hinge', 'clf__dual': True, 'vect__use_idf': True} 0.752 (+/-0.023) for {'vect__ngram_range': (1, 2), 'vect__norm': 'l2', 'vect__binary': False, 'clf__penalty': 'l2', 'clf__loss': 'hinge', 'clf__dual': True, 'vect__use_idf': False} 0.619 (+/-0.024) for {'vect__ngram_range': (1, 1), 'vect__norm': None, 'vect__binary': True, 'clf__penalty': 'l2', 'clf__loss': 'squared_hinge', 'clf__dual': True, 'vect__use_idf': True} 0.670 (+/-0.025) for {'vect__ngram_range': (1, 1), 'vect__binary': True, 'clf__penalty': 'l2', 'vect__use_idf': False, 'clf__loss': 'squared_hinge', 'clf__dual': True, 'vect__norm': None} 0.768 (+/-0.027) for {'vect__norm': 'l2', 'vect__binary': True, 'clf__penalty': 'l2', 'vect__use_idf': True, 'clf__loss': 'squared_hinge', 'clf__dual': True, 'vect__ngram_range': (1, 1)} 0.743 (+/-0.028) for {'vect__ngram_range': (1, 1), 'vect__norm': 'l2', 'vect__binary': True, 'clf__penalty': 'l2', 'clf__loss': 'squared_hinge', 'clf__dual': True, 'vect__use_idf': False} 0.594 (+/-0.030) for {'vect__norm': None, 'vect__binary': True, 'clf__penalty': 'l2', 'vect__use_idf': True, 'clf__loss': 'squared_hinge', 'clf__dual': True, 'vect__ngram_range': (1, 2)} 0.680 (+/-0.022) for {'vect__ngram_range': (1, 2), 'vect__norm': None, 'vect__binary': True, 'clf__penalty': 'l2', 'clf__loss': 'squared_hinge', 'clf__dual': True, 'vect__use_idf': False} 0.775 (+/-0.023) for {'vect__ngram_range': (1, 2), 'vect__binary': True, 'clf__penalty': 'l2', 'vect__use_idf': True, 'clf__loss': 'squared_hinge', 'clf__dual': True, 'vect__norm': 'l2'} 0.747 (+/-0.025) for {'vect__ngram_range': (1, 2), 'vect__norm': 'l2', 'vect__binary': True, 'clf__penalty': 'l2', 'clf__loss': 'squared_hinge', 'clf__dual': True, 'vect__use_idf': False} 0.605 (+/-0.024) for {'vect__ngram_range': (1, 1), 'vect__norm': None, 'vect__binary': False, 'clf__penalty': 'l2', 'clf__loss': 'squared_hinge', 'clf__dual': True, 'vect__use_idf': True} 0.664 (+/-0.025) for {'vect__ngram_range': (1, 1), 'vect__norm': 
None, 'vect__binary': False, 'clf__penalty': 'l2', 'clf__loss': 'squared_hinge', 'clf__dual': True, 'vect__use_idf': False} 0.769 (+/-0.021) for {'vect__ngram_range': (1, 1), 'vect__binary': False, 'clf__penalty': 'l2', 'vect__use_idf': True, 'clf__loss': 'squared_hinge', 'clf__dual': True, 'vect__norm': 'l2'} 0.745 (+/-0.026) for {'vect__norm': 'l2', 'vect__binary': False, 'clf__penalty': 'l2', 'vect__use_idf': False, 'clf__loss': 'squared_hinge', 'clf__dual': True, 'vect__ngram_range': (1, 1)} 0.605 (+/-0.024) for {'vect__ngram_range': (1, 2), 'vect__norm': None, 'vect__binary': False, 'clf__penalty': 'l2', 'clf__loss': 'squared_hinge', 'clf__dual': True, 'vect__use_idf': True} 0.680 (+/-0.025) for {'vect__ngram_range': (1, 2), 'vect__norm': None, 'vect__binary': False, 'clf__penalty': 'l2', 'clf__loss': 'squared_hinge', 'clf__dual': True, 'vect__use_idf': False} 0.780 (+/-0.021) for {'vect__ngram_range': (1, 2), 'vect__binary': False, 'clf__penalty': 'l2', 'vect__use_idf': True, 'clf__loss': 'squared_hinge', 'clf__dual': True, 'vect__norm': 'l2'} 0.756 (+/-0.027) for {'vect__norm': 'l2', 'vect__binary': False, 'clf__penalty': 'l2', 'vect__use_idf': False, 'clf__loss': 'squared_hinge', 'clf__dual': True, 'vect__ngram_range': (1, 2)} 0.628 (+/-0.027) for {'vect__norm': None, 'vect__binary': True, 'clf__penalty': 'l2', 'vect__use_idf': True, 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__ngram_range': (1, 1)} 0.670 (+/-0.025) for {'vect__ngram_range': (1, 1), 'vect__norm': None, 'vect__binary': True, 'clf__penalty': 'l2', 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__use_idf': False} 0.768 (+/-0.027) for {'vect__norm': 'l2', 'vect__binary': True, 'clf__penalty': 'l2', 'vect__use_idf': True, 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__ngram_range': (1, 1)} 0.743 (+/-0.028) for {'vect__norm': 'l2', 'vect__binary': True, 'clf__penalty': 'l2', 'vect__use_idf': False, 'clf__loss': 'squared_hinge', 'clf__dual': False, 
'vect__ngram_range': (1, 1)} 0.614 (+/-0.025) for {'vect__norm': None, 'vect__binary': True, 'clf__penalty': 'l2', 'vect__use_idf': True, 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__ngram_range': (1, 2)} 0.681 (+/-0.023) for {'vect__ngram_range': (1, 2), 'vect__norm': None, 'vect__binary': True, 'clf__penalty': 'l2', 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__use_idf': False} 0.775 (+/-0.023) for {'vect__norm': 'l2', 'vect__binary': True, 'clf__penalty': 'l2', 'vect__use_idf': True, 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__ngram_range': (1, 2)} 0.747 (+/-0.025) for {'vect__ngram_range': (1, 2), 'vect__norm': 'l2', 'vect__binary': True, 'clf__penalty': 'l2', 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__use_idf': False} 0.679 (+/-0.027) for {'vect__ngram_range': (1, 1), 'vect__norm': None, 'vect__binary': False, 'clf__penalty': 'l2', 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__use_idf': True} 0.676 (+/-0.026) for {'vect__norm': None, 'vect__binary': False, 'clf__penalty': 'l2', 'vect__use_idf': False, 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__ngram_range': (1, 1)} 0.769 (+/-0.021) for {'vect__ngram_range': (1, 1), 'vect__norm': 'l2', 'vect__binary': False, 'clf__penalty': 'l2', 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__use_idf': True} 0.745 (+/-0.026) for {'vect__ngram_range': (1, 1), 'vect__norm': 'l2', 'vect__binary': False, 'clf__penalty': 'l2', 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__use_idf': False} 0.694 (+/-0.018) for {'vect__norm': None, 'vect__binary': False, 'clf__penalty': 'l2', 'vect__use_idf': True, 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__ngram_range': (1, 2)} 0.690 (+/-0.027) for {'vect__norm': None, 'vect__binary': False, 'clf__penalty': 'l2', 'vect__use_idf': False, 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__ngram_range': (1, 2)} 0.779 (+/-0.022) for {'vect__ngram_range': (1, 2), 'vect__norm': 'l2', 
'vect__binary': False, 'clf__penalty': 'l2', 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__use_idf': True} 0.756 (+/-0.027) for {'vect__ngram_range': (1, 2), 'vect__norm': 'l2', 'vect__binary': False, 'clf__penalty': 'l2', 'clf__loss': 'squared_hinge', 'clf__dual': False, 'vect__use_idf': False}