# Source code for digitaldna.twitter_sequencer

"""
Digital DNA Sequencer for Twitter
"""
import json
import numpy as np

# Single-character "bases" emitted by the sequencer. The same character is
# intentionally reused by different alphabets (e.g. 'A' is both TWEET and
# URL), so a base is only meaningful together with the alphabet that
# produced it.
ENTITY = 'E'  # b3_content: exactly one non-empty entity type
HASHTAG = 'T'  # b6_content: only hashtags
MEDIA = 'G'  # b6_content: media (tweet carries 'extended_entities')
MENTION = 'C'  # b6_content: only user mentions
NONE = 'N'  # b3_content / b6_content: no non-empty entity type
MIXED = 'X'  # b3_content / b6_content: more than one non-empty entity type
REPLY = 'T'  # b3_type: reply that is not a retweet
RETWEET = 'C'  # b3_type: retweet
TWEET = 'A'  # b3_type: neither reply nor retweet
URL = 'A'  # b6_content: only URLs
UNKNOWN = 'U'  # b6_content: single entity type that is not recognised


class TwitterDDNASequencer:
    """Twitter Digital DNA Sequencer.

    Compute sequences of digital DNA from twitter timelines (check out
    https://developer.twitter.com/en/docs/tweets/timelines/api-reference/get-statuses-user_timeline.html).

    Parameters
    ----------
    alphabet : string or callable, default 'b3_type'
        Mapping between a tweet and the corresponding base. If ``alphabet``
        is a callable, it is called with each tweet (a dict) and must return
        the string to append to that user's sequence.
        Prebuilt alphabets are the following:

        - 'b3_type', where the correspondence is
            - 'A' for tweet
            - 'T' for reply
            - 'C' for retweet
        - 'b3_content', where the correspondence is
            - 'N' tweet contains no entities (plain text)
            - 'E' tweet contains entities of one type
            - 'X' tweet contains entities of mixed types
        - 'b6_content', where the correspondence is
            - 'N' tweet contains no entities (plain text)
            - 'A' tweet contains one or more URLs
            - 'T' tweet contains one or more hashtags
            - 'C' tweet contains one or more mentions
            - 'G' tweet contains one or more medias
            - 'X' tweet contains entities of mixed types
            - 'U' tweet contains a single, unrecognised entity type

    input_file : string, default ''
        Path of a JSON file holding the list of tweets. When not empty it
        takes precedence over the ``X`` argument of :meth:`transform`.

    Attributes
    ----------
    remap_ : callable
        Function mapping one tweet to its base; set by :meth:`fit`.

    References
    ----------
    S. Cresci, R. D. Pietro, M. Petrocchi, A. Spognardi and M. Tesconi,
    "Social Fingerprinting: Detection of Spambot Groups Through DNA-Inspired
    Behavioral Modeling", IEEE Transactions on Dependable and Secure
    Computing, vol. 15, no. 4, pp. 561-576, 1 July-Aug. 2018,
    https://ieeexplore.ieee.org/document/7876716

    S. Cresci, R. di Pietro, M. Petrocchi, A. Spognardi and M. Tesconi,
    "Exploiting Digital DNA for the Analysis of Similarities in Twitter
    Behaviours", 2017 IEEE International Conference on Data Science and
    Advanced Analytics (DSAA), Tokyo, 2017, pp. 686-695,
    https://ieeexplore.ieee.org/document/8259831
    """

    def __init__(self, alphabet='b3_type', input_file=''):
        """Store the parameters; no validation happens until :meth:`fit`."""
        import warnings

        # NOTE: this silences FutureWarning for the whole process, not only
        # for this class. Kept because existing callers may rely on it.
        warnings.simplefilter(action='ignore', category=FutureWarning)
        self.alphabet = alphabet
        self.input_file = input_file

    def fit(self, X=None, y=None):
        """Assign the alphabet mapper function to ``remap_``.

        Parameters
        ----------
        X : None
            Ignored. The pipeline API requires this parameter.
        y : None
            Ignored. There is no need of a target in a transformer, yet the
            pipeline API requires this parameter.

        Returns
        -------
        self : object
            Returns an instance of self.

        Raises
        ------
        ValueError
            If ``alphabet`` is neither a prebuilt alphabet name nor a
            callable.
        """
        prebuilt = {
            'b3_type': self._tweet2char_b3_type,
            'b3_content': self._tweet2char_b3_content,
            'b6_content': self._tweet2char_b6_content,
        }
        if callable(self.alphabet):
            self.remap_ = self.alphabet
        elif self.alphabet in prebuilt:
            self.remap_ = prebuilt[self.alphabet]
        else:
            # Fail here with a clear message instead of leaving remap_ unset
            # and crashing later inside transform().
            raise ValueError(
                'alphabet must be a callable or one of %s, got %r'
                % (sorted(prebuilt), self.alphabet)
            )
        return self

    def transform(self, X=None):
        """Transform an array of tweets into digital DNA sequences.

        Tweets are grouped by ``user.id`` and, within each user, the bases
        are concatenated in the order the tweets are encountered.

        Parameters
        ----------
        X : array-like of shape = [# of tweets, 1]
            The input samples, each sample is a python dict of a tweet as
            retrieved from twitter user timelines API. Ignored when
            ``input_file`` is set.

        Returns
        -------
        X_transformed : array of shape = [# of users, 2]
            The first column is the user id and the second is the digital
            dna sequence.

        Raises
        ------
        ValueError
            If both ``X`` is None and ``input_file`` is empty.
        """
        if self.input_file != '':
            # Context manager guarantees the handle is closed even when the
            # JSON is malformed.
            with open(self.input_file, 'r', encoding='utf-8') as handle:
                X = json.load(handle)
        elif X is None:
            # `is None` (not `== None`) so that numpy arrays are accepted.
            raise ValueError('X cannot be None if input_file is not specified')

        bases_by_user = {}
        for row in X:
            uid = self._nested_get(row, 'user.id')
            bases_by_user.setdefault(uid, []).append(self.remap_(row))

        return np.array(
            [(uid, ''.join(bases)) for uid, bases in bases_by_user.items()]
        )

    def fit_transform(self, X=None, y=None):
        """Fit and transform.

        Parameters
        ----------
        X : array-like of shape = [# of tweets, 1]
            The input samples, each sample is a python dict of a tweet as
            retrieved from twitter user timelines API.
        y : ignored
            Parameter needed to maintain a standard pattern.

        Returns
        -------
        X_transformed : array of shape = [n_samples, 2]
            The resulting array where the first column is the user id and
            the second is the translated sequence.
        """
        return self.fit(X, y).transform(X)

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Parameters
        ----------
        deep : bool, default True
            Accepted for compatibility with the estimator API; this
            estimator has no nested parameters, so it has no effect.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """
        return {'alphabet': self.alphabet, 'input_file': self.input_file}

    def set_params(self, alphabet, input_file):
        """Set the parameters of this estimator.

        Returns
        -------
        self
        """
        self.alphabet = alphabet
        self.input_file = input_file
        return self

    def _nested_get(self, dct, keys):
        """Follow a dot-separated key path; return None if it cannot be resolved."""
        for key in keys.split('.'):
            try:
                dct = dct[key]
            except (KeyError, TypeError):
                # TypeError covers an intermediate value that is not
                # subscriptable, e.g. "user": null in the JSON.
                return None
        return dct

    def _present_entity_types(self, tweet):
        """Return the keys of ``tweet['entities']`` whose value is non-empty."""
        return [name for name, value in tweet['entities'].items() if value]

    def _tweet2char_b3_type(self, tweet):
        """Map a tweet to RETWEET, REPLY or TWEET (retweet wins over reply)."""
        reply_id = tweet['in_reply_to_user_id']
        retweet_id = self._nested_get(tweet, 'retweeted_status.id')
        if retweet_id != 0 and retweet_id is not None:
            return RETWEET
        if reply_id != 0 and reply_id is not None:
            return REPLY
        return TWEET

    def _tweet2char_b3_content(self, tweet):
        """Map a tweet to NONE, ENTITY or MIXED by number of entity types."""
        n_entities = len(self._present_entity_types(tweet))
        if n_entities == 0:
            return NONE
        if n_entities > 1:
            return MIXED
        return ENTITY

    def _tweet2char_b6_content(self, tweet):
        """Map a tweet to a base describing which entity type it contains."""
        present = self._present_entity_types(tweet)
        if not present:
            return NONE
        if len(present) > 1:
            return MIXED

        kind = present[0]
        # The timeline API names this key 'urls'; 'url' is still accepted
        # for inputs that use the singular form.
        if kind in ('urls', 'url'):
            return URL
        if kind == 'hashtags':
            return HASHTAG
        if kind == 'user_mentions':
            return MENTION
        if kind == 'media' or 'extended_entities' in tweet:
            return MEDIA
        return UNKNOWN