# python version 3.6
# DGA (domain generation algorithm) feature building + classification.
# NOTE(review): this chunk relies on math, re, pd (pandas),
# train_test_split, accuracy_score and a pre-built DataFrame `df` —
# presumably imported/created earlier in the file; confirm against the
# full source.


def entropy(string):
    """Return the Shannon entropy of *string* in bits per character.

    DGA signal: algorithmically generated domains tend to have higher
    character entropy than human-chosen ones.
    """
    if not string:
        # Guard: the original divided by len(string), which is 0 here.
        return 0.0
    # Probability of each distinct character; dict.fromkeys de-duplicates
    # while preserving first-seen order.
    prob = [string.count(c) / len(string) for c in dict.fromkeys(string)]
    # math.log2(p) replaces the original math.log(p) / math.log(2.0).
    return -sum(p * math.log2(p) for p in prob)


# Apply entropy to the bare domain string.
df['entropy'] = df['domain'].apply(entropy)

# --- Additional lexical features -------------------------------------
# Hyphen count.
df['hyphen_count'] = df.domain.str.count('-')
# Dot count ('.' is a regex metacharacter for Series.str.count, so escape it).
df['dot_count'] = df.domain.str.count(r'\.')
# String length of the full domain.
df['string_len_domain'] = df.domain.str.len()
# TLD length.
df['tld_len'] = df.tld.str.len()

# Counts of vowels and consonants per domain.
vowels = set("aeiou")
cons = set("bcdfghjklmnpqrstvwxyz")
df['Vowels'] = [sum(1 for c in x if c in vowels) for x in df['domain']]
df['Consonants'] = [sum(1 for c in x if c in cons) for x in df['domain']]

# Vowel-to-consonant ratio.
# BUGFIX: the original divided by df['Consonents'] (misspelled), which
# would raise KeyError — the column created above is 'Consonants'.
df['consec_vowel_ratio'] = (df['Vowels'] / df['Consonants']).round(5)


def syllables(word):
    """Approximate syllable count: runs of vowels, ignoring a trailing 'e'."""
    word = word.lower()
    if word.endswith('e'):
        # Drop a (presumed silent) final 'e' so it isn't counted as a vowel run.
        word = word[:-1]
    return len(re.findall('[aeiou]+', word))


df['syllables'] = df['domain'].apply(syllables)

# --- Prediction code --------------------------------------------------
from xgboost import XGBClassifier

# NOTE(review): the original built `pred = pd.DataFrame(df.data,
# columns=columns)` here; `columns` is undefined (NameError) and `pred`
# was never used afterwards, so the dead, broken line was removed.

# Binary target: 1 for DGA, 0 for benign (assigned during data collection).
y = df.benign_dga

# Create training and testing sets (70/30 split).
# NOTE(review): X here is the whole `df`, which still contains the raw
# string columns ('domain', 'tld') and the target itself — XGBoost cannot
# consume object dtypes and including the label leaks it. Confirm `df` is
# numeric-only and label-free at this point, or select just the engineered
# feature columns instead.
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.3)

# Fit the model.
model = XGBClassifier(objective='binary:logistic')
model.fit(X_train, y_train)

# Make predictions for the held-out test set.
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

# Evaluate predictions.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))