Source code for stickydesign.stickydesign

from __future__ import division
import numpy as np
import itertools
import logging

from .endclasses import endarray, lton, wc
from .energetics_basic import EnergeticsBasic

LOGGER = logging.getLogger(__name__)


[docs]def values_chunked(items, endtype, chunk_dim=10): """ Given a list of lists of acceptable numbers for each position in a row of an array, create every possible row, and return an iterator that returns chunks of every possible row up to chunk_dim, iterating dimensions higher than chunk_dim. This probably doesn't need to be called directly, and may have a _ added in the future. Return this as an endarray, with set endtype. This can be easily emoved for use elsewhere. """ ilengths = [len(x) for x in items] n = len(items) items = [np.array(x) for x in items] if n > chunk_dim: p = n - chunk_dim q = chunk_dim outer = itertools.product(*(items[0:p])) else: p = 0 q = n def outer_iter(): yield () outer = outer_iter() chunk = np.zeros( [np.prod(ilengths[p:]), len(items)], dtype=int).view(endarray) chunk.endtype = endtype chunk[:, p:] = np.indices(ilengths[p:]).reshape(q, -1).T for i in range(p, n): chunk[:, i] = items[i][chunk[:, i]] for seq in outer: chunk[:, :p] = seq yield chunk
[docs]def get_accept_set(endtype, length, interaction, fdev, maxendspurious, spacefilter=None, adjacents=['n', 'n'], alphabet='n', energetics=None): if not energetics: energetics = EnergeticsBasic() if not spacefilter: spacefilter = spacefilter_standard(interaction, interaction * fdev, maxendspurious) # Generate the template. if endtype == 'DT': template = [lton[adjacents[0]]] + [lton[alphabet.lower()]] \ * length + [lton[wc[adjacents[1]]]] elif endtype == 'TD': template = [lton[wc[adjacents[1]]]] + [lton[alphabet.lower()]] \ * length + [lton[adjacents[0]]] elif endtype == 'S': template = [lton[alphabet.lower()]]*length LOGGER.info("Length {0}, type {1}, adjacents {2}, alphabet {3}.".format( length, endtype, adjacents, alphabet)) LOGGER.debug("Have template {0}.".format(template, endtype)) # Create the chunk iterator endchunk = values_chunked(template, endtype) # Use spacefilter to filter chunks down to usable sequences matcharrays = [] chunknum = 0 totchunks = None totends = np.product([len(x) for x in template]) LOGGER.debug( "Have {0} ends in total before any filtering.".format(totends)) for chunk in endchunk: matcharrays.append(spacefilter(chunk, energetics)) if not totchunks: totchunks = totends // len(chunk) chunknum += 1 LOGGER.debug("Found {0} filtered ends in chunk {1} of {2}.".format( len(matcharrays[-1]), chunknum, totchunks)) LOGGER.debug("Done with spacefiltering.") availends = np.vstack(matcharrays).view(endarray) availends.endtype = endtype return availends
def _make_avail(endtype, length, spacefilter, endfilter, endchooser, energetics, adjacents=['n', 'n'], num=0, numtries=1, oldendfilter=None, oldends=[], alphabet='n'): # Generate the template. if endtype == 'DT': template = [lton[adjacents[0]]] + [lton[alphabet.lower()]] \ * length + [lton[wc[adjacents[1]]]] elif endtype == 'TD': template = [lton[wc[adjacents[1]]]] + [lton[alphabet.lower()]] \ * length + [lton[adjacents[0]]] elif endtype == 'S': template = [lton[alphabet.lower()]]*length LOGGER.info("Length {0}, type {1}, adjacents {2}, alphabet {3}.".format( length, endtype, adjacents, alphabet)) LOGGER.debug("Have template {0}.".format(template, endtype)) # Create the chunk iterator endchunk = values_chunked(template, endtype) # Use spacefilter to filter chunks down to usable sequences matcharrays = [] chunknum = 0 totchunks = None totends = np.product([len(x) for x in template]) LOGGER.debug( "Have {0} ends in total before any filtering.".format(totends)) for chunk in endchunk: matcharrays.append(spacefilter(chunk, energetics)) if not totchunks: totchunks = totends // len(chunk) chunknum += 1 LOGGER.debug("Found {0} filtered ends in chunk {1} of {2}.".format( len(matcharrays[-1]), chunknum, totchunks)) LOGGER.debug("Done with spacefiltering.") availends = np.vstack(matcharrays).view(endarray) availends.endtype = endtype # Use endfilter to filter available sequences taking into account old # sequences. if len(oldends) > 0: if oldendfilter: availends = oldendfilter(oldends, None, availends, energetics) else: availends = endfilter(oldends, None, availends, energetics) return availends
[docs]def find_end_set_uniform(endtype, length, spacefilter, endfilter, endchooser, energetics, adjacents=['n', 'n'], num=0, numtries=1, oldendfilter=None, oldends=[], alphabet='n', _presetavail=False): """ Find a set of ends of uniform length and type satisfying uniform constraint functions (eg, constrant functions are the same for each end). This function is intended to be complicated and featureful. If you want something simpler, try easy_ends Parameters ---------- endtype : str right now 'DT' for 3'-terminal ends, and 'TD' for 5'-terminal ends, length : int length of ends, not including adjacent bases, if applicable. adjacents : list of str (defaults to ['n','n']): acceptable bases for adjacents (eg, ['n','n'] or ['c', 'c']) for the ends and their complements, num : int (defaults to 0): number of ends to find (0 keeps finding until available ends are exhausted) numtries : int (defaults to 1): if > 1, the function will return a list of sets of ends that all individually satisfy the constraints, so that the best one can be selected manually spacefilter: function a "spacefilter" function that takes endarrays and filters them down to ends that, not considering spurious interactions, are acceptable. endfilter: function an "endfilter" function that takes current ends in the set, available ends (filtered with current ends), and new ends added, and filters the available ends, considering interactions between ends (eg, spurious interactions). endchooser : function an "endchooser" function that takes current ends in the set and available ends, and returns a new end to add to the set. energetics : function an "energyfunctions" class that provides the energy functions for everything to use. oldends : endarray an endarray of old ends to consider as part of the set alphabet : str a single letter specifying what the alphabet for the ends should be (eg, four or three-letter code) oldendfilter : str a different "endfilter" function for use when filtering the available ends using interactions with old ends. This is normally not useful, but can be useful if you want, for example, to create a sets with higher cross-interactions between two subsets than within the two subsets. Returns ------- endarray an endarray of generated ends, including provided old ends """ if len(oldends) > 0: if type(oldends[0]) is str: oldends = endarray(oldends, endtype) if isinstance(_presetavail, endarray): startavail = _presetavail else: startavail = _make_avail(endtype, length, spacefilter, endfilter, endchooser, energetics, adjacents, num, numtries, oldendfilter, oldends, alphabet) endsets = [] availends = startavail.copy() LOGGER.debug("Starting with {0} ends.".format(len(availends))) while len(endsets) < numtries: curends = oldends availends = startavail.copy() numends = 0 while True: newend = endarray( np.array([endchooser(curends, availends, energetics)]), endtype) LOGGER.debug("Chose end {0}.".format(repr(newend))) newend.endtype = endtype availends = endfilter(newend, curends, availends, energetics) LOGGER.debug("Done filtering.") if curends is None: curends = newend elif len(curends) == 0: curends = newend else: curends = curends.append(newend) numends += 1 LOGGER.debug("Now have {0} ends in set, and {1} ends available.".format(numends, len(availends))) if len(availends) == 0: LOGGER.info("Found {0} ends.".format(numends)) break if numends >= num and num > 0: break endsets.append(curends) # Verification: # Note: this currently gives weird output that is not helpful when it fails. # But if this fails, you've done something very weird, most likely, because # this is just internal sanity checking. for endset in endsets: oldr = np.arange(0, len(oldends)) newr = np.arange(len(oldends), len(endset)) allr = np.arange(0, len(endset)) # All new ends must satisfy old ends: if oldendfilter is None and len(oldends) > 0: assert np.asarray( endfilter(endset[oldr, :], None, endset[newr, :], energetics) == endset[newr, :]).all() elif len(oldends) > 0: assert np.asarray( oldendfilter(endset[oldr, :], None, endset[newr, :], energetics) == endset[newr, :]).all() # Each new end must allow all others for i in newr: if oldendfilter is None: assert np.asarray( endfilter(endset[i, :][None, :], None, endset, energetics) == endset[i != allr, :]).all() else: assert np.asarray( oldendfilter(endset[i, :][None, :], None, endset[oldr, :], energetics) == endset[oldr, :]).all() assert np.asarray( endfilter(endset[i, :][None, :], None, endset[newr, :], energetics) == endset[newr[i != newr], :]).all() if len(endsets) > 1: return endsets else: if _presetavail is None or isinstance(_presetavail,endarray): return endsets[0], startavail else: return endsets[0]
[docs]def enhist(endtype, length, adjacents=['n', 'n'], alphabet='n', bins=None, energetics=None, plot=False, color='b'): if endtype == 'DT': template = [lton[adjacents[0]]] +\ [lton[alphabet.lower()]] * length + [lton[wc[adjacents[1]]]] elif endtype == 'TD': template = [lton[wc[adjacents[1]]]] +\ [lton[alphabet.lower()]] * length + [lton[adjacents[0]]] elif endtype == 'S': template = [lton[alphabet.lower()]]*length if not energetics: energetics = EnergeticsBasic() minbin = 0.8 * energetics.matching_uniform( endarray([([0, 3] * length)[0:length + 2]], endtype)) maxbin = 1.1 * energetics.matching_uniform( endarray([([1, 2] * length)[0:length + 2]], endtype)) if not bins: bins = np.arange(minbin, maxbin, 0.1) LOGGER.debug("Have template {0} and type {1}.".format(template, endtype)) # Create the chunk iterator endchunk = values_chunked(template, endtype) hist = np.zeros(len(bins) - 1, dtype='int') totends = np.product([len(x) for x in template]) finishedends = 0 info = {'min': np.inf, 'max': -np.inf, 'mean': 0} for chunk in endchunk: matchens = energetics.matching_uniform(chunk) hist += np.histogram(matchens, bins)[0] info['max'] = max(info['max'], np.amax(matchens)) info['min'] = min(info['min'], np.amin(matchens)) info['mean'] = (info['mean']*(finishedends)/(len(chunk)+finishedends) + np.mean(matchens) * len(chunk) / (len(chunk)+finishedends)) finishedends += len(matchens) LOGGER.debug("Done with {0}/{1} ends.".format(finishedends, totends)) x = (bins[:-1] + bins[1:]) / 2 n = hist info['emean'] = np.sum(n * x, dtype='double') / np.sum(n, dtype='int64') info['estd'] = np.sqrt( np.sum(n * ( x - info['emean'])**2, dtype='double') / np.sum(n, dtype='int64')) cs = np.cumsum(n) info['emedian'] = x[np.flatnonzero(cs >= cs[-1] / 2.0)[0]] if plot: import matplotlib.pyplot as plt plt.bar( bins[:-1], hist, width=(bins[1] - bins[0]), label="Type {3}, Length {0}, Adjs {1}, Alph {2}".format( length, adjacents, alphabet, endtype), color=color) plt.title( "Matching Energies of Ends of Type {3}, Length {0}, Adjs {1}, Alph {2}". format(length, adjacents, alphabet, endtype)) plt.xlabel("Standard Free Energy (-kcal/mol)") plt.ylabel("Number of Ends") # plt.show() return (hist, bins, info)
[docs]def easyends(endtype, endlength, number=0, interaction=None, fdev=0.05, maxspurious=0.5, maxendspurious=None, tries=1, oldends=[], adjs=['n', 'n'], energetics=None, alphabet='n', echoose=None, absolute=False, _presetavail=False): """ Easyends is an attempt at creating an easy-to-use function for finding sets of ends. * endtype: specifies the type of end being considered. The system for classifying end types goes from 5' to 3', and consists of letters describing each side of the end. For example, an end that starts after a double-stranded region on the 5' side and ends at the end of the strand would be 'DT', while one that starts at the beginning of a strand on the 5' side and ends in a double-stranded region would be 'TD'. 'T' stands for terminal, 'D' stands for double-stranded region, and 'S' stands for single-stranded region. 'S', however, is not currently supported. * endlength: specifies the length of end being considered, not including adjacent bases. * number (optional): specifies the number of ends to find. If zero or not provided, easyends tries to find as many ends as possible. * interaction (optional): a positive number corresponding to the desired standard free energy for hybridization of matching sticky ends. If not provided, easyends calculates an optimal value based on the sequence space. * fdev (default 0.05): the fractional deviation (above or below) of allowable matching energies. maxspurious (default 0.5): the maximum spurious interaction, as a fraction of the matching interaction. * maxendspurious (default None): if provided, maxspurious is only used for spurious interactions between ends defined as ends, and ends defined as complements. Maxendspurious is then the maximum spurious interaction between ends and ends, and complements and complements. In a system where spurious interactions between ends and complements are more important than other spurious interactions, this can allow for better sets of ends. * tries (default 1): if > 1, easyends will return a list of sets of ends, all satisfying the constraints. * oldends (optional): a list of ends to be considered as already part of the set. * adjacents (default ['n','n']): allowable adjacent bases for ends and complements. * absolute (default False): fdev, maxspurious, and maxendspurious to be interpreted as absolute kcal/mol values rather than fractional values. * energetics (optional): an energetics class providing the energy calculation functions. You probably don't need to change this. * alphabet (default 'n'): The alphabet to use for ends, allowing for three-letter codes. """ if not energetics: efunc = EnergeticsBasic() else: efunc = energetics if (not interaction) or (interaction == 0): interaction = enhist( endtype, endlength, energetics=efunc, adjacents=adjs, alphabet=alphabet)[2]['emedian'] LOGGER.info("Calculated optimal interaction energy is {0}.".format( interaction)) if not absolute: mult = interaction else: mult = 1.0 maxcompspurious = maxspurious * mult if not maxendspurious: maxendspurious = maxspurious * mult else: maxendspurious = maxendspurious * mult sfilt = spacefilter_standard(interaction, mult * fdev, maxendspurious) efilt = endfilter_standard_advanced(maxcompspurious, maxendspurious) if not echoose: echoose = endchooser_standard(interaction) return find_end_set_uniform( endtype, endlength, sfilt, efilt, echoose, energetics=efunc, numtries=tries, oldends=oldends, adjacents=adjs, num=number, alphabet=alphabet, _presetavail=_presetavail)
[docs]def easy_space(endtype, endlength, interaction=None, fdev=0.05, maxspurious=0.5, maxendspurious=None, tries=1, oldends=[], adjs=['n', 'n'], energetics=None, alphabet='n', echoose=None): length = endlength if not energetics: efunc = EnergeticsBasic() energetics = efunc else: efunc = energetics if (not interaction) or (interaction == 0): interaction = enhist( endtype, endlength, energetics=efunc, adjacents=adjs, alphabet=alphabet)[2]['emedian'] LOGGER.info("Calculated optimal interaction energy is {0}.".format( interaction)) maxcompspurious = maxspurious * interaction if not maxendspurious: maxendspurious = maxspurious * interaction else: maxendspurious = maxendspurious * interaction sfilt = spacefilter_standard(interaction, interaction * fdev, maxendspurious) spacefilter = sfilt if not echoose: echoose = endchooser_standard(interaction) adjacents = adjs if endtype == 'DT': template = [lton[adjacents[0]]] + [lton[alphabet.lower()]] \ * length + [lton[wc[adjacents[1]]]] elif endtype == 'TD': template = [lton[wc[adjacents[1]]]] + [lton[alphabet.lower()]] \ * length + [lton[adjacents[0]]] # Create the chunk iterator endchunk = values_chunked(template, endtype) # Use spacefilter to filter chunks down to usable sequences matcharrays = [] chunknum = 0 totchunks = None totends = np.product([len(x) for x in template]) LOGGER.info( "Have {0} ends in total before any filtering.".format(totends)) for chunk in endchunk: matcharrays.append(spacefilter(chunk, energetics)) if not totchunks: totchunks = totends // len(chunk) chunknum += 1 LOGGER.debug("Found {0} filtered ends in chunk {1} of {2}.".format( len(matcharrays[-1]), chunknum, totchunks)) LOGGER.debug("Done with spacefiltering.") availends = np.vstack(matcharrays).view(endarray) availends.endtype = endtype availendsr = np.repeat(availends, len(availends), axis=0) availendst = np.tile(availends, (len(availends), 1)) vals_ee = energetics.uniform(availendsr.ends, availendst.ends) vals_ec = energetics.uniform(availendsr.ends, availendst.comps) vals_ce = energetics.uniform(availendsr.comps, availendst.ends) vals_cc = energetics.uniform(availendsr.comps, availendst.comps) vals_tf = ((vals_ee < maxendspurious) & (vals_cc < maxendspurious) & (vals_ec < maxcompspurious) & (vals_ce < maxcompspurious)) zipendsnf = zip(availendsr.tolist(), availendst.tolist()) zipends = [zipendsnf[x] for x in np.flatnonzero(vals_tf)] return zipends
[docs]def spacefilter_standard(desint, dev, maxself): """ A spacefilter function: filters to ends that have a end-complement interaction of between desint-dev and desint+dev, and a self-interaction (end-end or comp-comp) of less than maxself. """ def spacefilter(fullends, energetics): matchenergies = energetics.matching_uniform(fullends) g4 = np.zeros(fullends.shape[0]) for w in range(0, (fullends.shape[1] - 3)): g4 += (np.sum( np.array(fullends[:, w:(w + 4)] == [2, 2, 2, 2]), axis=1) == 4) g4 += (np.sum( np.array(fullends[:, w:(w + 4)] == [1, 1, 1, 1]), axis=1) == 4) i = np.flatnonzero((matchenergies < desint + dev) & (matchenergies > desint - dev) & (g4 == 0)) matchenergies = matchenergies[i] fullends = fullends[i] selfselfenergies = energetics.uniform(fullends.ends, fullends.ends) compcompenergies = energetics.uniform(fullends.comps, fullends.comps) i = np.flatnonzero((selfselfenergies < maxself) & (compcompenergies < maxself)) return fullends[i] return spacefilter
[docs]def endfilter_standard(maxspurious): """ An endfilter function: filters out ends that have any (end-end, end-comp, comp-end, comp-comp) interactions with new ends above maxspurious. """ def endfilter(newends, currentends, availends, energetics): endendspurious = energetics.uniform( np.repeat(newends.ends, len(availends), 0), np.tile(availends.ends, (len(newends), 1))).reshape( len(availends), len(newends), order='F') endcompspurious = energetics.uniform( np.repeat(newends.ends, len(availends), 0), np.tile(availends.comps, (len(newends), 1))).reshape( len(availends), len(newends), order='F') compendspurious = energetics.uniform( np.repeat(newends.comps, len(availends), 0), np.tile(availends.ends, (len(newends), 1))).reshape( len(availends), len(newends), order='F') compcompspurious = energetics.uniform( np.repeat(newends.comps, len(availends), 0), np.tile(availends.comps, (len(newends), 1))).reshape( len(availends), len(newends), order='F') highspurious = np.amax( np.hstack((endendspurious, compendspurious, endcompspurious, compcompspurious)), 1) return availends[highspurious < maxspurious] return endfilter
[docs]def endfilter_standard_advanced(maxcompspurious, maxendspurious): """ An endfilter function: filters out ends that have end-comp or comp-end interactions above maxcompspurious, and end-end or comp-comp interactions above maxendspurious. """ def endfilter(newends, currentends, availends, energetics): endendspurious = energetics.uniform( np.repeat(newends.ends, len(availends), 0), np.tile(availends.ends, (len(newends), 1))).reshape( len(availends), len(newends), order='F') endcompspurious = energetics.uniform( np.repeat(newends.ends, len(availends), 0), np.tile(availends.comps, (len(newends), 1))).reshape( len(availends), len(newends), order='F') compendspurious = energetics.uniform( np.repeat(newends.comps, len(availends), 0), np.tile(availends.ends, (len(newends), 1))).reshape( len(availends), len(newends), order='F') compcompspurious = energetics.uniform( np.repeat(newends.comps, len(availends), 0), np.tile(availends.comps, (len(newends), 1))).reshape( len(availends), len(newends), order='F') highendspurious = np.amax( np.hstack((endendspurious, compcompspurious)), 1) highcompspurious = np.amax( np.hstack((compendspurious, endcompspurious)), 1) return availends[(highendspurious < maxendspurious) & (highcompspurious < maxcompspurious)] return endfilter
[docs]def energy_array_uniform(seqs, energetics): """ Given an endarray and a set of sequences, return an array of the interactions between them, including their complements. """ seqs = seqs.ends.append(seqs.comps) return energetics.uniform( np.repeat(seqs, seqs.shape[0], 0), np.tile( seqs, (seqs.shape[0], 1))).reshape((seqs.shape[0], seqs.shape[0]))
[docs]def endchooser_standard(desint, wiggle=0.0): """ An endchooser function: return a random end with end-comp energy closest to desint. """ def endchooser(currentends, availends, energetics): ddiff = np.abs(energetics.matching_uniform(availends) - desint) choices = np.flatnonzero(ddiff <= np.amin(ddiff)+wiggle) newend = availends[choices[np.random.randint(0, len(choices))]] return newend return endchooser
[docs]def endchooser_random(): """ An endchooser function: return a random end with end-comp energy closest to desint. """ def endchooser(currentends, availends, energetics): newend = availends[np.random.randint(0, len(availends))] return newend return endchooser