# coding: utf-8

import sys
import numpy
from sklearn import metrics
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import  CountVectorizer,TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import codecs
'''
train_words = [
     '我们 我们 长城 故宫 。 就是 中国',
     '我们 好孩子 长城 。 中国',
     '我们 好孩子 泡菜 。 孤独 韩国',
     '我们 泡菜 认证 。 韩国',
]

train_tags = ['中国','中国','韩国','韩国']

test_words = [
     '我 泡菜 韩国 好吃',
     '长城 好孩子 认证 。 中国',
 ]

test_tags = ['中国','韩国']
'''

# Toy corpus: whitespace-tokenized documents and their country labels.
# Tokens are pre-segmented Chinese/Korean-topic words; tags are the classes.
train_words = [
     '长城 故宫 就是 中国',
     '我们 中国',
     '我们 好孩子 泡菜 。 孤独 韩国',
]

train_tags = ['中国', '中国', '韩国']

test_words = [
     '我 泡菜 韩国 好吃',
]

test_tags = ['韩国']

def vectorize(train_words, test_words):
    """Hash-vectorize tokenized documents.

    Parameters
    ----------
    train_words, test_words : iterable of str
        Whitespace-tokenized documents, one string per document.

    Returns
    -------
    (train_data, test_data)
        Sparse matrices of hashed term features with identical column space.
    """
    # ``non_negative=True`` was removed in scikit-learn 0.21; the documented
    # replacement is ``alternate_sign=False`` (keeps all feature values >= 0).
    v = HashingVectorizer(alternate_sign=False)
    train_data = v.fit_transform(train_words)
    # HashingVectorizer is stateless, so plain transform() is correct for the
    # test set (the original's second fit_transform fitted nothing new).
    test_data = v.transform(test_words)
    return train_data, test_data

'''
tf-idf vectorization: fit vocabulary and idf on the training corpus.
'''
def vectorize1(train_words, test_words):
    """Tf-idf vectorize: learn vocabulary and idf on train, reuse for test.

    Parameters
    ----------
    train_words, test_words : iterable of str
        Whitespace-tokenized documents.

    Returns
    -------
    (train_data, test_data)
        Sparse tf-idf matrices sharing the training vocabulary columns.
    """
    # The original built a throwaway TfidfVectorizer(max_df=0.5) that was
    # immediately overwritten — dead code removed.
    tv = TfidfVectorizer(sublinear_tf=True)
    train_data = tv.fit_transform(train_words)
    # BUG FIX: reuse the fitted vectorizer for the test set. The original
    # re-fit a second vectorizer on the test corpus, so test idf weights were
    # computed from the test documents instead of the training statistics.
    test_data = tv.transform(test_words)
    # get_feature_names() was removed in scikit-learn 1.2.
    print(' '.join(tv.get_feature_names_out()))
    return train_data, test_data

'''
Count vectorization followed by tf-idf weighting (two-step pipeline).
'''
def vectorize2(train_words, test_words):
    """Count-vectorize then tf-idf weight, sharing vocabulary and idf stats.

    Parameters
    ----------
    train_words, test_words : iterable of str
        Whitespace-tokenized documents.

    Returns
    -------
    (train_data, test_data)
        Sparse tf-idf matrices; test columns are restricted to the training
        vocabulary so both matrices align.
    """
    count_v1 = CountVectorizer(stop_words='english', max_df=0.5)
    counts_train = count_v1.fit_transform(train_words)

    # Restrict the test vectorizer to the training vocabulary so the test
    # matrix has exactly the same columns as the training matrix.
    count_v2 = CountVectorizer(vocabulary=count_v1.vocabulary_)
    counts_test = count_v2.fit_transform(test_words)

    tfidftransformer = TfidfTransformer()
    # BUG FIX: fit the transformer on the TRAINING counts only. The original
    # called .fit(counts_test) a second time, which overwrote the training idf
    # with idf computed from the test set — a train/test leak and a scale
    # mismatch between the two returned matrices.
    train_data = tfidftransformer.fit_transform(counts_train)
    test_data = tfidftransformer.transform(counts_test)
    return train_data, test_data



def main():
    """Vectorize the module-level toy corpus and print the tf-idf matrices."""
    train_data, test_data = vectorize1(train_words, test_words)
    # Single-argument print(...) is valid in both Python 2 and Python 3.
    print(type(train_data))
    print(train_data.toarray())
    print(train_data.shape)

    print(test_data.toarray())
    print(test_data.shape)


if __name__ == '__main__':
    main()