Source code for digitaldna.sequence_plots

from collections import Counter
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.utils.validation import check_array
from .twitter_sequencer import ENTITY, HASHTAG, MEDIA, MENTION, NONE, MIXED, REPLY, RETWEET, TWEET, URL, UNKNOWN

# Module-level side effect: make 'spring' matplotlib's default colormap for
# image-style plots as soon as this module is imported.
plt.rcParams['image.cmap'] = 'spring'


class SequencePlots:
    """
    The Digital DNA plots utility class.

    Parameters
    ----------
    alphabet : string, optional
        The sequences' alphabet, used to show prettier labels. Possible
        values are:

        - 'b3_type'
        - 'b3_content'
        - 'b6_content'
        - None

        Any other value is treated like None. Default: None
    """

    def __init__(self, alphabet=None):
        import warnings

        # NOTE: this filter is process-wide, not limited to this class; it is
        # kept because existing callers may rely on the quieter output.
        warnings.simplefilter(action='ignore', category=FutureWarning)

        # The letter constants come from .twitter_sequencer; each mapping
        # translates a letter into a human readable label.
        if alphabet == 'b3_type':
            self.alphabet = {TWEET: 'tweet', RETWEET: 'retweet', REPLY: 'reply'}
        elif alphabet == 'b3_content':
            self.alphabet = {NONE: 'no entity', MIXED: 'mixed', ENTITY: 'entity'}
        elif alphabet == 'b6_content':
            self.alphabet = {NONE: 'no entity', MIXED: 'mixed', URL: 'url',
                             HASHTAG: 'hashtag', MENTION: 'mention',
                             MEDIA: 'media', UNKNOWN: 'unknown entity'}
        else:
            self.alphabet = None

    def plot_alphabet_distribution(self, X):
        """
        Draw a box plot where each box represents the distribution of a
        letter in the sequences. The alphabet is inferred from the sequences.

        Parameters
        ----------
        X : array-like, shape (n_samples, 1), mandatory
            The input sequences of digital dna

        Returns
        -------
        self : SequencePlots
            This instance, to allow call chaining.
        """
        # Validation only: the array returned by check_array is not used.
        # np.str_ replaces the np.unicode_ alias removed in NumPy 2.0.
        check_array(X, ensure_2d=False, dtype=np.str_)
        freq = self._letter_counts(X)
        sns.boxplot(data=freq, color='cyan').set_ylabel('# of occurrences')
        sns.swarmplot(data=freq, color='red').set_xlabel('alphabet')
        return self

    def plot_sequences_color(self, X):
        """
        Draw a matrix image where each row is a digital dna sequence and each
        letter is represented by a different color. Sequences are ordered
        from the longest to the shortest; cells past the end of a sequence
        are drawn in white and labelled 'no-data'.

        Parameters
        ----------
        X : array-like, shape (n_samples, 1), mandatory
            The input sequences of digital dna

        Returns
        -------
        self : SequencePlots
            This instance, to allow call chaining.
        """
        check_array(X, ensure_2d=False, dtype=np.str_)
        ordered = sorted(X, key=len, reverse=True)
        matrix = self._string_arr_to_int_matrix(ordered)

        # Replace each character code by its rank among the codes present,
        # so that the colormap needs exactly one entry per code.
        codes = np.unique(matrix)
        n_codes = len(codes)
        ranks = np.searchsorted(codes, matrix)

        # Only the padding code (0) is white; real letters always get a color,
        # even when no padding is present in the matrix.
        palette = plt.get_cmap("tab10")
        cmap = ListedColormap(['white' if code == 0 else palette(rank)
                               for rank, code in enumerate(codes)])

        # Half-integer limits centre every color band on an integer rank, so
        # the colorbar ticks can be placed in data coordinates.
        image = plt.matshow(ranks, cmap=cmap, aspect='auto',
                            vmin=-0.5, vmax=n_codes - 0.5)
        cbar = plt.colorbar(image)
        cbar.set_ticks(np.arange(n_codes))
        cbar.set_ticklabels([self._label_for(code) for code in codes])
        plt.show()
        return self

    def plot_intrasequence_entropy(self, X):
        """
        Draw a box plot with a single box representing the distribution of
        the intra-sequence entropies (the Shannon Entropy computed over a
        single digital dna sequence).

        Parameters
        ----------
        X : array-like, shape (n_samples, 1), mandatory
            The input sequences of digital dna

        Returns
        -------
        self : SequencePlots
            This instance, to allow call chaining.
        """
        check_array(X, ensure_2d=False, dtype=np.str_)
        entropy = self._compute_entropy(X)
        ax = sns.boxplot(data=entropy, color="white")
        sns.swarmplot(data=entropy, color="red", ax=ax)
        ax.set_ylabel("Intraseq Shannon Entropy")
        ax.set_yticks(self._entropy_ticks(entropy))
        ax.set_xticklabels([])
        ax.set_aspect(1.)
        return self

    def plot_intersequence_entropy(self, X):
        """
        Draw a composite plot. On the left, a box plot of the distribution of
        the inter-sequence entropy (the Shannon Entropy of the letters found
        at the same index across different sequences). On the right, the
        entropies plotted in order of sequence index.

        Parameters
        ----------
        X : array-like, shape (n_samples, 1), mandatory
            The input sequences of digital dna

        Returns
        -------
        self : SequencePlots
            This instance, to allow call chaining.
        """
        check_array(X, ensure_2d=False, dtype=np.str_)
        # Transposing turns every sequence index into one row of letters.
        entropy = self._compute_entropy(self._string_arr_to_int_matrix(X).T)
        _, axes = plt.subplots(1, 2, gridspec_kw={'width_ratios': [1, 3]})

        unique = np.unique(entropy)
        ax = sns.boxplot(data=unique, color="white", ax=axes[0])
        sns.swarmplot(data=unique, color="red", ax=axes[0])
        ax.set_ylabel("Interseq Shannon Entropy")
        ax.set_yticks(self._entropy_ticks(unique))
        ax.set_xticklabels([])

        # Target the right-hand panel explicitly instead of the current axes.
        axes[1].plot(range(len(entropy)), entropy, '-x')
        plt.subplots_adjust(wspace=.5)
        return self

    def _compute_entropy(self, X):
        """
        Compute the Shannon Entropy (base 2) of every row of X.

        Rows may be strings or numeric code arrays. In numeric rows the value
        0 marks an empty cell and is ignored wherever it appears.

        Returns
        -------
        entropies : ndarray of float32, shape (len(X),)
            One entropy per row; rows without symbols yield 0.
        """
        entropies = np.zeros(len(X), dtype=np.float32)
        for i, row in enumerate(X):
            if isinstance(row, str):
                symbols = row
            else:
                values = np.asarray(row)
                if values.dtype.kind in 'biuf':
                    # When sequences are transposed, 0 means empty, so it is
                    # dropped (also between letters, not just at the ends).
                    values = values[values != 0]
                symbols = values.ravel().tolist()
            counts = np.array(list(Counter(symbols).values()), dtype=np.float64)
            if counts.size:
                probabilities = counts / counts.sum()
                entropies[i] = -np.sum(probabilities * np.log2(probabilities))
        return entropies

    def _string_arr_to_int_matrix(self, X):
        """
        Convert the sequences into a matrix of character codes.

        Returns
        -------
        matrix : ndarray of int32, shape (len(X), length of longest sequence)
            Row i holds ord() of every letter of X[i], right-padded with 0.
        """
        rows = [[ord(letter) for letter in seq] for seq in X]
        width = max((len(codes) for codes in rows), default=0)
        # int32 holds every Unicode code point; int8 overflowed above 127.
        matrix = np.zeros((len(rows), width), dtype=np.int32)
        for i, codes in enumerate(rows):
            if codes:
                matrix[i, :len(codes)] = codes
        return matrix

    def _find_alphabet(self, X):
        """Return the distinct letters used in the sequences, sorted."""
        return sorted({letter for seq in X for letter in seq})

    def _letter_counts(self, X):
        """Return a DataFrame of integer letter counts, one row per sequence."""
        rows = [dict(Counter(seq)) for seq in X]
        counts = pd.DataFrame(rows, columns=self._find_alphabet(X))
        # Letters missing from a sequence are NaN at this point.
        return counts.fillna(0).astype(int)

    def _label_for(self, code):
        """Return the colorbar label of a character code (0 means padding)."""
        if code == 0:
            return 'no-data'
        letter = chr(int(code))
        if self.alphabet is None:
            return letter
        description = self.alphabet.get(letter)
        if description is None:
            # Letter not described by the chosen alphabet: show it as it is.
            return letter
        return letter + ' (' + description + ')'

    @staticmethod
    def _entropy_ticks(entropy, step=.2):
        """Return y ticks from 0 in `step` increments, covering at least 1.6."""
        peak = float(np.max(entropy)) if len(entropy) else 0.
        return np.arange(.0, max(1.8, peak + step), step)