Source code for digitaldna.lcs

"""
This is a module to be used as a reference for building other modules
"""
import warnings
from multiprocessing import Process

from glcr import longest_common_subsequence

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_array

from .verbosity import Verbosity


class LongestCommonSubsequence(BaseEstimator):
    """The Digital DNA Python implementation.

    Parameters
    ----------
    in_path : str, optional
        The name, with absolute path, of a file containing the sequences
        you want to process. The input file must be a txt file whose
        first row contains the number of sequences to read. Default: ''
    out_path : str, optional
        The output file name, with absolute path, of the files where the
        algorithm saves its results when verbosity equals
        `Verbosity.FILE` or `Verbosity.FILE_EXTENDED`.
        Default: '/tmp/glcr_cache'
    overwrite : boolean, optional
        It must be False to reuse the LCS files produced by a previous
        fit call, in which case the file names are the ones derived from
        the out_path parameter. If True, recomputes the LCS files.
        Default: False
    threshold : 'auto' or int, optional
        The cutting threshold, expressed as a number of accounts,
        between bot and not bot. If 'auto', the threshold is estimated
        from the LCS curve (see the window parameter); otherwise the
        given value is used directly. Default: 'auto'
    window : int, optional
        The size of the window used to compute the cutting threshold
        between bot and not bot. The cutting point is computed by
        smoothing the curve, differentiating the result and taking the
        first (l.h.s.) local maximum. The window parameter influences
        both the smoothing and the search for the local maximum. It must
        satisfy 2 <= window < n_accounts. Default: 10
    verbosity : Verbosity, optional
        Specifies whether to save results to files or not. It must be:

        - TEST: does not write anything, used for benchmarking
        - MEMORY_ONLY: retrieves only the couples (sequence length,
          # of accounts), used for plots
        - FILE: produces two files; each row of the file named
          out_path + '.gsa' contains the identifier of a sequence,
          while each row of the file named out_path + '.mat' contains:

          - sequence length
          - # of accounts
          - range of indexes (begin and end)
        - FILE_EXTENDED: as FILE, but the out_path + '.mat' file also
          contains the column of the common subsequence

        Default: Verbosity.FILE

    References
    ----------
    S. Cresci, R. Di Pietro, M. Petrocchi, A. Spognardi and M. Tesconi,
    "Social Fingerprinting: Detection of Spambot Groups Through
    DNA-Inspired Behavioral Modeling", IEEE Transactions on Dependable
    and Secure Computing, vol. 15, no. 4, pp. 561-576, July-Aug. 2018,
    https://ieeexplore.ieee.org/document/7876716

    S. Cresci, R. Di Pietro, M. Petrocchi, A. Spognardi and M. Tesconi,
    "Exploiting Digital DNA for the Analysis of Similarities in Twitter
    Behaviours", 2017 IEEE International Conference on Data Science and
    Advanced Analytics (DSAA), Tokyo, 2017, pp. 686-695,
    https://ieeexplore.ieee.org/document/8259831

    M. Arnold and E. Ohlebusch, "Linear Time Algorithms for
    Generalizations of the Longest Common Substring Problem",
    Algorithmica, vol. 60, pp. 806-818, 2011,
    https://link.springer.com/article/10.1007/s00453-009-9369-1
    """
    def __init__(self, in_path='', out_path='/tmp/glcr_cache',
                 overwrite=False, threshold='auto', window=10,
                 verbosity=Verbosity.FILE):
        self.in_path = in_path
        self.out_path = out_path
        self.overwrite = overwrite
        self.threshold = threshold
        self.window = window
        if self.window < 2:
            raise ValueError('window parameter cannot be less than 2.')
        self.verbosity = verbosity

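    # Construction sketch (hedged): the parameter values below are purely
    # illustrative, not recommendations taken from the papers or defaults
    # fitted to any dataset.
    #
    #   >>> from digitaldna.lcs import LongestCommonSubsequence
    #   >>> from digitaldna.verbosity import Verbosity
    #   >>> estimator = LongestCommonSubsequence(out_path='/tmp/glcr_cache',
    #   ...                                      threshold=50, window=10,
    #   ...                                      verbosity=Verbosity.FILE)
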
    def fit(self, X, y=None):
        """Computes the longest common subsequence.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The training input samples.
        y : None
            There is no need of a target in a transformer, yet the
            pipeline API requires this parameter.

        Attributes
        ----------
        lcs_ : pandas dataframe of shape (n_distinct_couples, 2), default=None
            The dataframe containing the distinct couples
            (lcs, n_of_accounts). Only if:
            verbosity == Verbosity.MEMORY_ONLY

        Returns
        -------
        self : object
            Returns self.
        """
        warnings.simplefilter(action='ignore', category=FutureWarning)
        self.gsa_ = '.gsa'
        self.mat_ = '.mat'
        if self.in_path == '':
            # Sequences come from memory: NUL-terminate them and convert
            # them to byte strings for the glcr backend.
            X = self._unicode_to_ascii(X)
            X = check_array(X, ensure_2d=False)
        print("fitting...")
        if self.verbosity == Verbosity.MEMORY_ONLY:
            # X was already converted above when reading from memory
            # (converting twice would corrupt the byte strings); when
            # in_path is set, the sequences are read from the file.
            self.lcs_ = longest_common_subsequence(X, self.in_path,
                                                   self.out_path,
                                                   self.verbosity)
        elif self.verbosity > Verbosity.MEMORY_ONLY:
            if self.in_path == '' or self.overwrite:
                # Compute in a separate process; the results are read
                # back from the output files in predict.
                p = Process(target=longest_common_subsequence,
                            args=(X, self.in_path, self.out_path,
                                  self.verbosity))
                p.start()
                p.join()
        return self

    def predict(self, X=None):
        """Predicts the labels (True for bot, False for not bot) of X
        according to the LCS and the window parameter. If X is None,
        returns the same as fit_predict(X_train).

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features), default=None
            The query sample or samples to identify bot groups. If None,
            makes the prediction on the training data.

        Returns
        -------
        y : array, shape (n_samples,)
            Returns True for bots and False for real timelines.
        """
        y = None
        if self.verbosity > Verbosity.MEMORY_ONLY:
            y = np.full(len(X), False)
            mat_ix = set()
            if self.threshold == 'auto':
                print('finding cut...')
                lengths = pd.read_csv(self.out_path + self.mat_,
                                      usecols=['length']).length.values
                self.cut_ = self._decision_function(lengths)
            else:
                self.cut_ = self.threshold
            print('predicting...')

            class BreakIt(Exception):
                pass

            # Accumulate the index ranges of the groups below the cut;
            # stop at the first row involving at least cut_ accounts.
            try:
                for df in pd.read_csv(self.out_path + self.mat_,
                                      chunksize=500000):
                    for ix, row in df.iterrows():
                        curr_set = mat_ix.union(
                            set(range(int(row.begin), int(row.end) + 1)))
                        if row.num_texts >= self.cut_:  # not bot, exit
                            raise BreakIt
                        mat_ix = curr_set
            except BreakIt:
                pass
            # Map the collected indexes to sequence identifiers through
            # the '.gsa' file.
            ix_size = len(mat_ix)
            ix_count = 0
            bot_ix = np.empty(ix_size, dtype=int)
            try:
                for df in pd.read_csv(self.out_path + self.gsa_, header=0,
                                      usecols=['wordindex'], squeeze=True,
                                      chunksize=500000):
                    for ix, wordindex in df.items():
                        if ix in mat_ix:
                            # TODO delete the -1 when the gsa file is fixed
                            bot_ix[ix_count] = wordindex - 1
                            ix_count += 1
                            if ix_count == ix_size:
                                raise BreakIt
                            mat_ix.remove(ix)
            except BreakIt:
                y[bot_ix] = True
        else:
            warnings.warn("Cannot predict with verbosity level lower "
                          "than FILE", Warning)
        print('done.')
        return y

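    # Shape of the out_path + '.mat' file consumed above, sketched from the
    # columns this class reads (the file itself is produced by the glcr
    # backend; the values here are illustrative only):
    #
    #   length,num_texts,begin,end
    #   42,3,0,2      <- a subsequence of length 42 shared by 3 accounts,
    #   40,5,3,7         whose '.gsa' rows span the indexes begin..end
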
    def fit_predict(self, X, y=None):
        """Fits the model to the training set X and returns the labels
        (True for bot, False for not bot) on the training set according
        to the window parameter.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The training input samples.

        Attributes
        ----------
        lcs_ : pandas dataframe of shape (n_distinct_couples, 2), default=None
            The dataframe containing the distinct couples
            (lcs, n_of_accounts).

        Returns
        -------
        y : array, shape (n_samples,)
            Returns True for bots and False for real timelines.
        """
        return self.fit(X).predict(X)

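    # End-to-end sketch (hedged): `timelines` is a hypothetical array of
    # Digital DNA strings, one behavioural sequence per account; it is not
    # part of this module.
    #
    #   >>> import numpy as np
    #   >>> timelines = np.array(['ACCCA', 'ACCCA', 'ACCCA', 'ATACC'])
    #   >>> lcs = LongestCommonSubsequence(verbosity=Verbosity.FILE)
    #   >>> bots = lcs.fit_predict(timelines)  # boolean mask, True = bot
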
    def plot_LCS(self):
        """Plots the longest common subsequence curve as
        (number of accounts, sequence length).

        Attributes
        ----------
        lcs_ : pandas dataframe of shape (n_distinct_couples, 2), default=None
            The dataframe containing the distinct couples
            (lcs, n_of_accounts).

        Returns
        -------
        self : returns an instance of self.
        """
        plt.xlabel('# of accounts')
        plt.ylabel('LCS')
        if self.verbosity > Verbosity.MEMORY_ONLY and not hasattr(self, 'lcs_'):
            self.lcs_ = pd.read_csv(self.out_path + self.mat_,
                                    usecols=['length', 'num_texts']) \
                .drop_duplicates().reset_index().drop(['index'], axis=1)
        plt.plot(self.lcs_.num_texts, self.lcs_.length, marker='x')
        if hasattr(self, 'cut_'):
            plt.plot([self.cut_, self.cut_], [0, max(self.lcs_.length)],
                     linestyle='--')
        plt.show()
        return self

    def plot_LCS_log(self):
        """Plots the longest common subsequence curve as
        (log(number of accounts), log(sequence length)).

        Attributes
        ----------
        lcs_ : pandas dataframe of shape (n_distinct_couples, 2), default=None
            The dataframe containing the distinct couples
            (lcs, n_of_accounts).

        Returns
        -------
        self : returns an instance of self.
        """
        plt.xlabel('log(# of accounts)')
        plt.ylabel('log(LCS)')
        if self.verbosity > Verbosity.MEMORY_ONLY and not hasattr(self, 'lcs_'):
            self.lcs_ = pd.read_csv(self.out_path + self.mat_,
                                    usecols=['length', 'num_texts']) \
                .drop_duplicates().reset_index().drop(['index'], axis=1)
        plt.loglog(self.lcs_.num_texts, self.lcs_.length, marker='x')
        if hasattr(self, 'cut_'):
            plt.plot([self.cut_, self.cut_], [0, max(self.lcs_.length)],
                     linestyle='--')
        plt.show()
        return self

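    # Plotting sketch (hedged): after a fit with verbosity FILE or higher,
    # both curves can be drawn from the cached out_path + '.mat' file
    # (`lcs` and `timelines` are the hypothetical objects from the sketch
    # after fit_predict above).
    #
    #   >>> lcs.fit(timelines).plot_LCS()  # linear axes
    #   >>> lcs.plot_LCS_log()             # log-log axes
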
    def _decision_function(self, X):
        """Finds the first relative maximum on the smoothed LCS vector."""
        length = len(X)
        w_size = self.window
        # Smooth the curve with a running mean, then differentiate it.
        X_avg_diff = np.diff(self._running_mean(X, w_size))
        i = 0
        max_index = length - w_size
        # Skip the initial stretch: stop as soon as the derivative drops
        # below its mean over the next window.
        for i in range(max_index):
            if X_avg_diff[i] < np.mean(X_avg_diff[i:i + w_size]):
                break
        # Place the cut at the first maximum of the derivative from that
        # point on.
        t = np.argmax(X_avg_diff[i:])
        return t + i - 1

    def _running_mean(self, x, winsize):
        # Forward-looking moving average, computed in place: x is both
        # mutated and returned.
        for i in range(len(x)):
            x[i] = x[i:min(i + winsize, len(x))].mean()
        return x

    def _unicode_to_ascii(self, array):
        # NUL-terminate each sequence and convert the array to byte
        # strings, presumably the representation the glcr backend expects.
        def f(item):
            return str(item) + '\0'

        def v(x):
            return np.vectorize(f)(x)

        return v(array).astype('S')
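

if __name__ == '__main__':
    # Minimal sketch of the automatic threshold search on toy data (run it
    # with `python -m digitaldna.lcs` because of the relative import above).
    # The lengths below are invented to mimic a sorted LCS curve with a
    # knee; they come from no real dataset.
    toy_lengths = np.array([90., 88., 85., 60., 30., 12., 10., 9., 8., 7.,
                            6., 6., 5., 5., 5.])
    estimator = LongestCommonSubsequence(window=3)
    print('toy cut index:', estimator._decision_function(toy_lengths))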