Source code for digitaldna.twitter_sequencer

"""
Digital DNA Sequencer for Twitter
"""
import json
import numpy as np

# Characters emitted by the 'b3_type' alphabet (kind of status).
# NOTE(review): the cited Cresci et al. papers appear to define B3_type as
# A=tweet, C=reply, T=retweet, i.e. REPLY/RETWEET look swapped here. Confirm
# before changing: swapping them alters every sequence already produced.
TWEET, REPLY, RETWEET = 'A', 'T', 'C'

# Characters emitted by the 'b3_content' alphabet (how many entity kinds the
# tweet carries): none, exactly one kind, more than one kind.
NONE, ENTITY, MIXED = 'N', 'E', 'X'

# Characters emitted by the 'b6_content' alphabet for a tweet carrying a
# single kind of entity (NONE and MIXED above are reused for the other cases).
URL, HASHTAG, MENTION, MEDIA = 'A', 'T', 'C', 'G'

# Fallback for a single entity kind that none of the cases above recognises.
UNKNOWN = 'U'


class TwitterDDNASequencer:
    """Twitter Digital DNA Sequencer.

    Compute sequences of digital DNA from twitter timelines (check out
    https://developer.twitter.com/en/docs/tweets/timelines/api-reference/get-statuses-user_timeline.html)

    Parameters
    ----------
    alphabet : string or callable, default 'b3_type'
        Mapping between a tweet and the corresponding base.
        If ``alphabet`` is a callable, it is called once per tweet (a dict)
        and must return the string to append to that user's sequence.
        Prebuilt alphabets are the following (the characters listed are the
        ones this implementation emits):

        - 'b3_type',  where the correspondence is
                - 'A' for tweet
                - 'T' for reply
                - 'C' for retweet
        - 'b3_content', where the correspondence is
                - 'N' tweet contains no entities (plain text)
                - 'E' tweet contains entities of one type
                - 'X' tweet contains entities of mixed types
        - 'b6_content', where the correspondence is
                - 'N' tweet contains no entities (plain text)
                - 'A' tweet contains one or more URLs
                - 'T' tweet contains one or more hashtags
                - 'C' tweet contains one or more mentions
                - 'G' tweet contains one or more medias
                - 'X' tweet contains entities of mixed types
                - 'U' tweet contains a single entity type not listed above

    input_file : string, default ''
        Path of a JSON file holding the list of tweets. When non-empty it
        takes precedence over the ``X`` argument of :meth:`transform`.

    Attributes
    ----------
    remap_ : callable
        Function mapping one tweet to its character(s); set by :meth:`fit`.

    References
    ----------
    S. Cresci, R. D. Pietro, M. Petrocchi, A. Spognardi and M. Tesconi,
    "Social Fingerprinting: Detection of Spambot Groups Through DNA-Inspired Behavioral Modeling",
    IEEE Transactions on Dependable and Secure Computing, vol. 15, no. 4, pp. 561-576, 1 July-Aug. 2018,
    https://ieeexplore.ieee.org/document/7876716

    S. Cresci, R. di Pietro, M. Petrocchi, A. Spognardi and M. Tesconi,
    "Exploiting Digital DNA for the Analysis of Similarities in Twitter Behaviours",
    2017 IEEE International Conference on Data Science and Advanced Analytics (DSAA),
    Tokyo, 2017, pp. 686-695, https://ieeexplore.ieee.org/document/8259831
    """

    def __init__(self, alphabet='b3_type', input_file=''):
        import warnings
        # NOTE(review): this installs a process-wide filter, so every
        # FutureWarning (not only those triggered here) is hidden once an
        # instance is created. Kept for backward compatibility.
        warnings.simplefilter(action='ignore', category=FutureWarning)
        self.alphabet = alphabet
        self.input_file = input_file

    def fit(self, X=None, y=None):
        """Select the mapper function for the configured alphabet.

        Parameters
        ----------
        X : None
            Ignored. The pipeline API requires this parameter.

        y : None
            Ignored. There is no need of a target in a transformer, yet the
            pipeline API requires this parameter.

        Attributes
        ----------
        remap_ : callable taking a tweet (same format as returned by the
                GET statuses/user_timeline call) and returning the
                corresponding character given the alphabet parameter.

        Returns
        -------
        self : object
            Returns an instance of self.

        Raises
        ------
        ValueError
            If ``alphabet`` is neither a callable nor a known alphabet name.
        """
        prebuilt = {
            'b3_type': self._tweet2char_b3_type,
            'b3_content': self._tweet2char_b3_content,
            'b6_content': self._tweet2char_b6_content,
        }
        if callable(self.alphabet):
            self.remap_ = self.alphabet
        elif isinstance(self.alphabet, str) and self.alphabet in prebuilt:
            self.remap_ = prebuilt[self.alphabet]
        else:
            # Fail here rather than with "NoneType is not callable" later
            # in transform().
            raise ValueError(
                'Unknown alphabet %r: expected a callable or one of %s'
                % (self.alphabet, sorted(prebuilt)))

        return self

    def transform(self, X=None):
        """Turn a collection of tweets into one digital DNA sequence per user.

        Tweets are grouped by ``user.id`` in order of first appearance and the
        characters produced by ``remap_`` are concatenated in input order.

        Parameters
        ----------
        X : iterable of dict, optional
            The input samples, each sample is a python dict of a tweet as
            retrieved from twitter user timelines API (check out
            https://developer.twitter.com/en/docs/tweets/timelines/api-reference/get-statuses-user_timeline.html).
            Ignored when ``input_file`` is set.

        Returns
        -------
        X_transformed : numpy array
            One row per user: the user id followed by its digital dna
            sequence. An empty input yields an empty array.

        Raises
        ------
        ValueError
            If ``X`` is None and no ``input_file`` was specified.
        """
        if self.input_file:
            # JSON text is UTF-8; do not depend on the platform default.
            with open(self.input_file, "r", encoding="utf-8") as f:
                X = json.load(f)
        elif X is None:
            # `is None` (not `==`): comparing a numpy array with `==` is
            # element-wise and has no single truth value.
            raise ValueError('X cannot be None if input_file is not specified')

        codes_by_user = {}
        for row in X:
            uid = self._nested_get(row, 'user.id')
            codes_by_user.setdefault(uid, []).append(self.remap_(row))

        return np.array([(uid, ''.join(codes))
                         for uid, codes in codes_by_user.items()])

    def fit_transform(self, X=None, y=None):
        """Fit and transform.

        Parameters
        ----------
        X : iterable of dict, optional
            The input samples, each sample is a python dict of a tweet as
            retrieved from twitter user timelines API (check out
            https://developer.twitter.com/en/docs/tweets/timelines/api-reference/get-statuses-user_timeline.html)

        y : ignored parameter needed to maintain a standard pattern

        Returns
        -------
        X_transformed : numpy array
            The resulting array where the first column is the user id and the
            second is the translated sequence.
        """
        return self.fit(X, y).transform(X)

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Parameters
        ----------
        deep : bool, default True
            Accepted for compatibility with the scikit-learn estimator API;
            this estimator has no nested estimators, so it has no effect.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """
        return {'alphabet': self.alphabet,
                'input_file': self.input_file}

    def set_params(self, alphabet=None, input_file=None):
        """Set the parameters of this estimator.

        Parameters
        ----------
        alphabet : string or callable, optional
            New alphabet; ``None`` leaves the current value unchanged.

        input_file : string, optional
            New input file path; ``None`` leaves the current value unchanged.

        Returns
        -------
        self
        """
        if alphabet is not None:
            self.alphabet = alphabet
        if input_file is not None:
            self.input_file = input_file
        return self

    def _nested_get(self, dct, keys):
        """Follow the dot-separated path ``keys`` through nested mappings.

        Returns None as soon as a key is missing or an intermediate value
        cannot be indexed (e.g. ``"retweeted_status": null``).
        """
        for key in keys.split('.'):
            try:
                dct = dct[key]
            except (KeyError, TypeError):
                return None
        return dct

    def _non_empty_entities(self, tweet):
        """Return the names of the entity lists of ``tweet`` that are non-empty.

        A tweet with a missing or null 'entities' field has no entities.
        """
        entities = tweet.get('entities') or {}
        return [name for name, value in entities.items() if value]

    def _tweet2char_b3_type(self, tweet):
        """Map a tweet to TWEET, REPLY or RETWEET ('b3_type' alphabet).

        A status carrying a retweeted status is a retweet even if it is
        also flagged as a reply.
        """
        reply_id = tweet.get('in_reply_to_user_id')
        retweet_id = self._nested_get(tweet, 'retweeted_status.id')

        if retweet_id is not None and retweet_id != 0:
            return RETWEET
        if reply_id is not None and reply_id != 0:
            return REPLY
        return TWEET

    def _tweet2char_b3_content(self, tweet):
        """Map a tweet to NONE, ENTITY or MIXED ('b3_content' alphabet)."""
        n_entities = len(self._non_empty_entities(tweet))

        if n_entities == 0:
            return NONE
        if n_entities > 1:
            return MIXED
        return ENTITY

    def _tweet2char_b6_content(self, tweet):
        """Map a tweet to its 'b6_content' character.

        NONE / MIXED when the tweet has zero / several kinds of entities,
        otherwise the character of its single entity kind, or UNKNOWN when
        that kind is not recognised.
        """
        present = self._non_empty_entities(tweet)

        if not present:
            return NONE
        if len(present) > 1:
            return MIXED

        kind = present[0]
        # The Twitter entities object names the list 'urls'; 'url' is kept
        # because earlier versions of this method tested for it.
        if kind in ('urls', 'url'):
            return URL
        if kind == 'hashtags':
            return HASHTAG
        if kind == 'user_mentions':
            return MENTION
        if kind == 'media' or 'extended_entities' in tweet:
            return MEDIA
        return UNKNOWN