import re
import numpy as np
from sklearn import cross_validation
from sklearn import datasets
from sklearn import svm
from sklearn.externals import  joblib
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn import linear_model 
from sklearn.naive_bayes import GaussianNB

from sklearn import cross_validation
from sklearn import tree

# Global dataset containers, filled in by etl():
# x holds one feature vector per URL sample, y the matching 0/1 labels.
x, y = [], []

def get_len(url):
    """Return the raw character length of *url* (feature 1)."""
    return len(url)

def get_url_count(url):
    if re.search('(http://)|(https://)', url, re.IGNORECASE) :
        return 1
    else:
        return 0

def get_evil_char(url):
    """Count occurrences of characters commonly seen in XSS markup."""
    hits = re.findall("[<>,\'\"/]", url, re.IGNORECASE)
    return len(hits)

def get_evil_word(url):
    return len(re.findall("(alert)|(script=)(%3c)|(%3e)|(%20)|(onerror)|(onload)|(eval)|(src=)|(prompt)",url,re.IGNORECASE))

def get_last_char(url):
    """Return 1 when *url* ends with a '/' character, else 0."""
    return 1 if re.search('/$', url, re.IGNORECASE) else 0

def get_feature(url):
    """Assemble the full five-element feature vector for *url*."""
    extractors = (get_len, get_url_count, get_evil_char,
                  get_evil_word, get_last_char)
    return [extract(url) for extract in extractors]


def etl(filename, data, isxss):
    """Load one URL per line from *filename* and build feature rows.

    Appends a 4-element feature vector per line to *data* and the
    matching class label to the module-level list ``y``.

    filename -- path to a text file, one sample per line
    data     -- list that feature vectors are appended to (mutated)
    isxss    -- truthy for XSS samples (label 1), falsy for benign (0)

    Returns *data* (also mutated in place).
    """
    with open(filename) as f:
        for line in f:
            # Drop the trailing newline: it inflated get_len() by one
            # per sample and leaked into the regex-based features.
            url = line.rstrip('\n')
            # NOTE(review): only four of get_feature()'s five features
            # are built here — get_last_char is omitted; confirm this
            # is intentional.
            data.append([get_len(url), get_url_count(url),
                         get_evil_char(url), get_evil_word(url)])
            y.append(1 if isxss else 0)
    return data

# Build the labelled training set: XSS payloads are labelled 1,
# benign traffic 0 (etl also fills the global label list y).
etl('../data/xss-200000.txt', x, 1)
etl('../data/good-xss-200000.txt', x, 0)

clf = tree.DecisionTreeClassifier()
clf2 = svm.SVC(kernel='linear', C=1)
clf3 = linear_model.LogisticRegression(C=1e5)
clf4 = GaussianNB()

# 5-fold cross-validate each candidate model and report the per-fold
# scores plus their mean.  Python 3 fixes: print() calls and
# dict.items() replace the Python-2-only print statement and
# .iteritems(), which modern scikit-learn releases cannot run under.
models = {"decision tree": clf, "svm train long time": clf2,
          "LR": clf3, "bayes": clf4}
for name, model in models.items():
    print("model:", name)
    score = cross_validation.cross_val_score(model, x, y, n_jobs=-1, cv=5)
    print(score)
    print(np.mean(score))