Sign up to create your own snipts, or login.

Public snipts » dgg32's snipts The latest snipts from dgg32.

showing 1-18 of 18 snipts
  • calculate p value to determine if null hypothesis is true
    #!/usr/bin/env python
    
    
    import cmath
    import sys
    
    def normsdist (x, mean, standard_dev):
        res = 0
        
        x = (x -mean) / standard_dev
        
        if x == 0:
            res = 0.5
        else:
            oor2pi = 1 /(cmath.sqrt(2 * 3.14159265358979323846))
            t = 1 / (1 + 0.2316419 * abs(x))
            t = t * oor2pi * cmath.exp(-0.5 * x * x) * (0.31938153 + t * (-0.356563782 + t * (1.781477937 + t * (-1.821255978 + t * 1.330274429))))
            if x >= 0:
                res = 1 - t
            else:
                res = t
        return res
    
    
    def calcP(Untergewicht_Nichtraucherin, Untergewicht_Raucherin, nicht_Geburten_Nichtraucherin, nicht_Geburten_Raucherin):
        Total_Nichtraucherin = Untergewicht_Nichtraucherin+ nicht_Geburten_Nichtraucherin
        Total_Raucherin = Untergewicht_Raucherin + nicht_Geburten_Raucherin
        
        Quote_Nichtraucherin = Untergewicht_Nichtraucherin/Total_Nichtraucherin
        Quote_Raucherin = Untergewicht_Raucherin/Total_Raucherin
        
        Summe_Untergewicht = Untergewicht_Nichtraucherin + Untergewicht_Raucherin
        Summe_nicht_Geburten = nicht_Geburten_Nichtraucherin + nicht_Geburten_Raucherin
        
        Summe_Total = Summe_Untergewicht + Summe_nicht_Geburten
        Quote_Summe = Summe_Untergewicht/Summe_Total
        
        Testwert = (Untergewicht_Nichtraucherin*nicht_Geburten_Raucherin-nicht_Geburten_Nichtraucherin*Untergewicht_Raucherin)/cmath.sqrt(Summe_Untergewicht*Summe_nicht_Geburten*Total_Nichtraucherin*Total_Raucherin/Summe_Total)
    
        P = 2*normsdist(-abs(Testwert), 0, 1)
        return P
    
    
    Untergewicht_Nichtraucherin = int(sys.argv[1])
    Untergewicht_Raucherin = int(sys.argv[2])
    nicht_Geburten_Nichtraucherin = int(sys.argv[3])
    nicht_Geburten_Raucherin = int(sys.argv[4])
    P = calcP(Untergewicht_Nichtraucherin, Untergewicht_Raucherin, nicht_Geburten_Nichtraucherin, nicht_Geburten_Raucherin)
    
    print P
    
    if abs(P) < 0.05:
        print "alternative hypo, p1 != p2"
    else:
        print "null hypo, p1 == p2"
    

    copy | embed

    0 comments - tagged in  posted by dgg32 on Mar 03, 2010 at 6:51 p.m. EST
  • normsdist, returns the probability that the observed value of a standard normal random variable will be less than or equal to z. A standard normal random variable has mean 0 and standard deviation 1 (and also variance 1 because variance = standard deviation squared).
    #!/usr/bin/env python
    
    import cmath
    import sys
    
    def normsdist (x, mean, standard_dev):
        res = 0
        
        x = (x -mean) / standard_dev
        
        if x == 0:
            res = 0.5
        else:
            oor2pi = 1 /(cmath.sqrt(2 * 3.14159265358979323846))
            t = 1 / (1 + 0.2316419 * abs(x))
            t = t * oor2pi * cmath.exp(-0.5 * x * x) * (0.31938153 + t * (-0.356563782 + t * (1.781477937 + t * (-1.821255978 + t * 1.330274429))))
            if x >= 0:
                res = 1 - t
            else:
                res = t
        return res
    
    x = sys.argv[1]
    print normsdist (float(x), 0, 1)
    

    copy | embed

    0 comments - tagged in  posted by dgg32 on Mar 03, 2010 at 6:05 p.m. EST
  • Translate_EMBL
    #include <QString>
    #include <QHash>
    #include <iostream>
    #include <QTextStream>
    
    void translateThis (QString DNA, int offset, QHash <QString, QString> CodonTable);
    
    
    int main (int agrc, char** argv)
    {
    
        QHash <QString, QString> CodonTable;
        CodonTable.insert ("TTT", "F");
        CodonTable.insert ("TTC", "F");
        CodonTable.insert ("TTA", "L");
        CodonTable.insert ("TTG", "L");
        CodonTable.insert ("CTT", "L");
        CodonTable.insert ("CTC", "L");
        CodonTable.insert ("CTA", "L");
        CodonTable.insert ("CTG", "L");
        CodonTable.insert ("ATT", "I");
        CodonTable.insert ("ATC", "I");
        CodonTable.insert ("ATA", "I");
        CodonTable.insert ("ATG", "M");
        CodonTable.insert ("GTT", "V");
        CodonTable.insert ("GTC", "V");
        CodonTable.insert ("GTA", "V");
        CodonTable.insert ("GTG", "V");
        CodonTable.insert ("TCT", "S");
        CodonTable.insert ("TCC", "S");
        CodonTable.insert ("TCA", "S");
        CodonTable.insert ("TCG", "S");
        CodonTable.insert ("CCT", "P");
        CodonTable.insert ("CCC", "P");
        CodonTable.insert ("CCA", "P");
        CodonTable.insert ("CCG", "P");
        CodonTable.insert ("ACT", "T");
        CodonTable.insert ("ACC", "T");
        CodonTable.insert ("ACA", "T");
        CodonTable.insert ("ACG", "T");
        CodonTable.insert ("GCT", "A");
        CodonTable.insert ("GCC", "A");
        CodonTable.insert ("GCA", "A");
        CodonTable.insert ("GCG", "A");
        CodonTable.insert ("TAT", "Y");
        CodonTable.insert ("TAC", "Y");
        CodonTable.insert ("TAA", "*");
        CodonTable.insert ("TAG", "*");
        CodonTable.insert ("CAT", "H");
        CodonTable.insert ("CAC", "H");
        CodonTable.insert ("CAA", "Q");
        CodonTable.insert ("CAG", "Q");
        CodonTable.insert ("AAT", "N");
        CodonTable.insert ("AAC", "N");
        CodonTable.insert ("AAA", "K");
        CodonTable.insert ("AAG", "K");
        CodonTable.insert ("GAT", "D");
        CodonTable.insert ("GAC", "D");
        CodonTable.insert ("GAA", "E");
        CodonTable.insert ("GAG", "E");
        CodonTable.insert ("TGT", "C");
        CodonTable.insert ("TGC", "C");
        CodonTable.insert ("TGA", "*");
        CodonTable.insert ("TGG", "W");
        CodonTable.insert ("CGT", "R");
        CodonTable.insert ("CGC", "R");
        CodonTable.insert ("CGA", "R");
        CodonTable.insert ("CGG", "R");
        CodonTable.insert ("AGT", "S");
        CodonTable.insert ("AGC", "S");
        CodonTable.insert ("AGA", "R");
        CodonTable.insert ("AGG", "R");
        CodonTable.insert ("GGT", "G");
        CodonTable.insert ("GGC", "G");
        CodonTable.insert ("GGA", "G");
        CodonTable.insert ("GGG", "G");
    
    
        QTextStream in (stdin);
        while (!in.atEnd())
        {
            QString Line = in.readLine();
    
            if (!Line.contains(">"))
            {
                translateThis (Line, 0, CodonTable);
            }
        }
    }
    
    void translateThis (QString DNA, int offset, QHash <QString, QString> CodonTable)
    {
    
        QString Protein;
        for (int i = offset-1; i != DNA.length(); i += 3)
        {
            QString Aminoacid = CodonTable[DNA.mid(i, 3)];
            if (Aminoacid != "*")
            {
                Protein += Aminoacid;
            }
            else
            {
                break;
            }
        }
    }
    

    copy | embed

    0 comments - tagged in  posted by dgg32 on Feb 19, 2010 at 12:28 p.m. EST
  • update cdna counts in region_read
    #!/usr/bin/env python
    
    import sys
    import string
    import MySQLdb
    import re
    
    
    hostname = sys.argv[1]
    dbname = sys.argv[2]
    username = sys.argv[3]
    password = sys.argv[4]
    
    
    
    conn = MySQLdb.connect (host = hostname,
                               user = username,
                               passwd = password,
                               db = dbname)
    
    cursor = conn.cursor ()
    
    getRead = "SELECT mapped_start, mapped_stop, parent_region_id FROM Silmaril_SSAHA2_cDNA_Mapping"
    
    cursor.execute (getRead)
    
    resultset = cursor.fetchall()
    
    region_reads = {}
    
    for result in resultset:
        contig_id = result[2]
        start = result[0]
        stop = result[1]
        
        getRegionCDS = "SELECT stop, start, _id FROM Region WHERE parent_region_id = '" + str(contig_id) + "' OR _id = '" + str(contig_id) + "';"
        
        cursor.execute (getRegionCDS)
        
        regions = cursor.fetchall()
        
        for region in regions:
            region_start = region [1]
            region_stop = region [0]
            region_id = region[2]
            
            if (max (start, stop) >= min(region_start, region_stop) and max (start, stop) <= max (region_start, region_stop)) or (min (start, stop) >= min (region_start, region_stop) and min(start, stop) <= max (region_start, region_stop)) or (min (start, stop) <= min(region_start, region_stop) and max (start, stop) >= max(region_start, region_stop)):
                if region_id in region_reads:
                    region_reads [region_id] = region_reads [region_id] + 1
                else:
                    region_reads [region_id] = 1
    
    for key in region_reads.keys():
        insert = "UPDATE `Region_Read` SET cdna = '" + str(region_reads[key]) + "' WHERE region_id = '" + str(key) + "';"
        #print insert
        cursor.execute (insert)
    

    copy | embed

    0 comments - tagged in  posted by dgg32 on Feb 16, 2010 at 8:33 a.m. EST
  • count the read amount of contigs and regions
    #!/usr/bin/env python
    
    import sys
    import string
    import MySQLdb
    import re
    
    
    hostname = sys.argv[1]
    dbname = sys.argv[2]
    username = sys.argv[3]
    password = sys.argv[4]
    
    
    
    conn = MySQLdb.connect (host = hostname,
                               user = username,
                               passwd = password,
                               db = dbname)
    
    cursor = conn.cursor ()
    
    getRead = "SELECT mapped_start, mapped_stop, region_id FROM Ace2mapable"
    
    cursor.execute (getRead)
    
    resultset = cursor.fetchall()
    
    region_reads = {}
    
    for result in resultset:
        contig_id = result[2]
        start = result[0]
        stop = result[1]
        
        getRegionCDS = "SELECT stop, start, _id FROM Region WHERE parent_region_id = '" + str(contig_id) + "' OR _id = '" + str(contig_id) + "';"
        
        cursor.execute (getRegionCDS)
        
        regions = cursor.fetchall()
        
        for region in regions:
            region_start = region [1]
            region_stop = region [0]
            region_id = region[2]
            
            if (max (region_start, region_stop) >= min (start, stop) and min (region_start, region_stop) <= min (start, stop)) or (max(region_start, region_stop) >= max (start, stop) and min(region_start, region_stop) <= max (start, stop)) or (min (region_start, region_stop) <= min (start, stop) and max (region_start, region_stop) >= max (start, stop)):
                if region_id in region_reads:
                    region_reads [region_id] = region_reads [region_id] + 1
                else:
                    region_reads [region_id] = 1
    
    for key in region_reads.keys():
        insert = "INSERT INTO `Region_Read`(`region_id`, `read_count`) VALUES ('" + str(key) + "', '" + str(region_reads[key]) + "';"
        cursor.execute (insert)
            
        
        
    

    copy | embed

    0 comments - tagged in  posted by dgg32 on Feb 13, 2010 at 4:50 p.m. EST
  • detect duplicate names in fasta and give a suggestion
    import sys
    from sets import Set
    from collections import defaultdict
    import random
    
    def get_optimal (start, length, reverse_Collision):
        optimal = list ()
        counter = 0
        stop = 0
        for i in sorted(reverse_Collision):
            if stop == 1:
                break
            
            for j in sorted(reverse_Collision[i]):
            
                if counter >= start:
                    optimal.append (j)
                #print reverse_Collision[i][j]
                counter = counter + 1
                
                if (counter-start) == Length:
                    stop = 1
                    break
        return optimal
    
    Trial = 2
    Length = 7
    Collision = dict()
    Collection = []
    
    allNames = list()
    
    for line in sys.stdin:
        allNames.append (line.rstrip())
        for i in range(len(line.rstrip())):
            if len(Collection) == i:
            
                #print "first if "  + line [i]
                Collection.append(Set())
                Collection[i].add(line[i])
                Collision[i] = 0
            else:
                #print "first else " + line [i]
                if line[i] in Collection[i]:
                    Collision[i] = Collision[i] + 1
                else:
                    Collection[i].add (line[i])
                    
    reverse_Collision = defaultdict(list)
    
    for key in Collision:
        #print key
        reverse_Collision[Collision[key]].append(key)
        #print Collision[key]
        
    #print reverse_Collision
    
    
    fail = 2
    start = 0
    taken = Set()
    optimal = list()
    
    while fail != 0:
        optimal = get_optimal (start, Length, reverse_Collision)
        times = 0
        
        if (len(taken) + 1) >= len(allNames):
            fail = 0
            
    #print "len(allNames) - 1  " + str(len(allNames) - 1)
    
        if fail != 0:
            index = random.randint(0, len(allNames) - 1)
    
    
            while index in taken: 
    
                index = random.randint(0, len(allNames) - 1)
            
            taken.add (index)
    
            sample = ""
            for i in range(len(optimal)):
                sample = sample + allNames[index][optimal[i]]
    
    #print sample
    
    
            for i in range (len(allNames)):
        
                if fail == 1:
                    output = ""
                    for k in range(len(optimal)):
                        output = output + " " + str(optimal[k])
                    print "Positions" + output + " fail."
                    break
                
                if i not in taken:
                    times = times + 1
                
                
            
                    target = ""
                    for j in range(len(optimal)):
                        target = target + allNames[i][optimal[j]]
            
                
                    if sample == target:
                        print "Collision found"
                        fail = 1
                        start = start + 1
                        break
                
                    print times
        if times == Trial:
            fail = 0
            print "Try " + str(times) + " times. No collision found!"
    
    output = ""
    if fail == 0:
        for k in range(len(optimal)):
            output = output + " " + str(optimal[k])
        print "I suggest positions " + output + "."
        #print i
        
    #print sorted_Collision_values
        
    

    copy | embed

    0 comments - tagged in  posted by dgg32 on Feb 06, 2010 at 5:25 a.m. EST
  • correct the names in ANME1Endseqs GenDB project
    import sys
    import re
    from sets import Set
    import MySQLdb
    
    
    hostname = sys.argv[1]
    dbname = sys.argv[2]
    username = sys.argv[3]
    password = sys.argv[4]
    
    
    
    conn = MySQLdb.connect (host = hostname,
                               user = username,
                               passwd = password,
                               db = dbname)
    
    cursor = conn.cursor ()
    
    
    f = open ('/megx/home/shuang/sequences/BSMallEndseqForHanno.fas')
    
    pms = set()
    
    
    for line in f.readlines(): 
    
        if line.startswith(">"):
            result = line.rstrip().split (" ")
            #print result
            for pm_result in result:
                pms.add(pm_result[1:])
           
            
            
    f.close()
    
    #print pms
    
    
    
    f_match = re.compile (r'f_(\d+)([a-z]+)_(\w+)')
    
    a_match_2 = re.compile (r'a4_([0-9a-z]+)_m_([a-z0-9]+)')
    
    a_match = re.compile (r'a4_([0-9a-z]+)_([0-9a-ln-z]+)([_a-z0-9]*)')
    
    a5_match = re.compile (r'a5_([0-9a-z]+)_([a-z]+)_([a-z0-9]+)')
    
    b_match = re.compile (r'b(\d+)_([0-9a-z]+)_([rf])_(b\d+)')
    
    command1="SELECT description from Sequence"
    	    #print command1
    cursor.execute (command1)
    resultset = cursor.fetchall ()
    for name in resultset:
    
        output = ""
                #print result.group(1)
        f_result = f_match.match (name[0])
        if f_result:
            PM_name = "fc1f" + f_result.group(1) + "-es_" + f_result.group(2) + "7." + f_result.group(3)
                    
            PM_name_1= "fc1f" + f_result.group(1) + "-es_" + f_result.group(2) + "1328." + f_result.group(3)
                    
            if PM_name in pms:
                output = PM_name
            elif PM_name_1 in pms:
                output = PM_name_1
                        
                        
            else:
                print "no " + PM_name + "\t" + name[0]
                        
                
                
        a_result_2 = a_match_2.match (name[0])
        if a_result_2:
            PM_name = "anke4-" + a_result_2.group(1) + "_f40.m1328." + a_result_2.group(2)
            
    
                    
            if PM_name in pms:
                output = PM_name
                        
            else:
                print "no " + PM_name + "\t"  + name[0]
                        #print PM_name
                        
                        
        a_result = a_match.match (name[0])
        if a_result:
            PM_name = "anke4-" + a_result.group(1) + "_f40." + a_result.group(2).replace ("_", ".")
            
            if len(a_result.groups()) == 3:
                PM_name = PM_name + a_result.group(3).replace ("_", ".")
                    
            if PM_name in pms:
                output = PM_name
                        
            else:
                print "no " + PM_name + "\t"  + name[0] + " a_result.groups(): " + str(len(a_result.groups()))
                
                        #print PM_name
                
        a5_result = a5_match.match (name[0])
        if a5_result:
            PM_name = "anke5" + a5_result.group(1) + "-es_" + a5_result.group(2) + "1328." +  a5_result.group(3)
                    
            PM_name_1= "anke5" + a5_result.group(1) + "-es_" + a5_result.group(2) + "7." +  a5_result.group(3)
                    
            if PM_name in pms:
                output = PM_name
            elif PM_name_1 in pms:
                output = PM_name_1
                        
            else:
                print "no " + PM_name + "\t"  + name[0]
                            
    
        b_result = b_match.match (name[0])
        if b_result:
            PM_name = "fcb" + b_result.group(1) + "-" + b_result.group(2) + "." + b_result.group(3) + "f40_" + b_result.group(4) + ".SCF"
                    
            if PM_name in pms:
                        
                output = PM_name
            else:
                print "no " + PM_name + "\t"  + name[0]
        
        command2 = "Update Sequence set name = '" + output + "' where description = '" + name[0] + "';"
        #print command2
        cursor.execute(command2)
        
    print "finished!"
        #print output
        #print output.rstrip()
    

    copy | embed

    0 comments - tagged in  posted by dgg32 on Feb 05, 2010 at 9:02 a.m. EST
  • Correct the names from ANME1_Endseqs GenDB project according to original fasta file
    import sys
    import re
    from sets import Set
    
    f = open ('/Users/user/Downloads/BSMallEndseqForHanno.fas')
    
    pms = set()
    
    
    for line in f.readlines(): 
    
        if line.startswith(">"):
            result = line.rstrip().split (" ")
            #print result
            for pm_result in result:
                pms.add(pm_result[1:])
           
            
            
    f.close()
    
    #print pms
    
    
    matchobj = re.compile (r'(\S+):\s+\d+\.\d+')
    
    f_match = re.compile (r'f_(\d+)([a-z]+)_(\w+)')
    
    a_match = re.compile (r'a4_([0-9a-z]+)_(\w+)')
    
    a5_match = re.compile (r'a5_([0-9a-z]+)_([a-z]+)_([a-z0-9]+)')
    
    b_match = re.compile (r'b(\d+)_([0-9a-z]+)_([rf])_(b\d+)')
    
    for line in sys.stdin:
        names = line.split("\t")
    
        output = ""
        for name in names:
            result = matchobj.match (name)
            
            if result:
    
                #print result.group(1)
                f_result = f_match.match (result.group(1))
                if f_result:
                    PM_name = "fc1f" + f_result.group(1) + "-es_" + f_result.group(2) + "7." + f_result.group(3)
                    
                    PM_name_1= "fc1f" + f_result.group(1) + "-es_" + f_result.group(2) + "1328." + f_result.group(3)
                    
                    if PM_name in pms:
                        output = output +  PM_name + "    "
                    elif PM_name_1 in pms:
                        output = output +  PM_name_1 + "    "
                        
                        
                    else:
                        print "no " + PM_name + "\t" + result.group(1)
                        
                
                
                a_result = a_match.match (result.group(1))
                if a_result:
                    PM_name = "anke4-" + a_result.group(1) + "_f40." + a_result.group(2).replace ("_", ".")
                    
                    if PM_name in pms:
                        output = output +  PM_name + "    "
                        
                    else:
                        print "no " + PM_name + "\t"  + result.group(1)
                        #print PM_name
                        
                
                a5_result = a5_match.match (result.group(1))
                if a5_result:
                    PM_name = "anke5" + a5_result.group(1) + "-es_" + a5_result.group(2) + "1328." +  a5_result.group(3)
                    
                    PM_name_1= "anke5" + a5_result.group(1) + "-es_" + a5_result.group(2) + "7." +  a5_result.group(3)
                    
                    if PM_name in pms:
                        output = output +  PM_name + "    "
                    elif PM_name_1 in pms:
                        output = output +  PM_name_1 + "    "
                        
                    else:
                       print "no " + PM_name + "\t"  + result.group(1)
                            
    
                b_result = b_match.match (result.group(1))
                if b_result:
                    PM_name = "fcb" + b_result.group(1) + "-" + b_result.group(2) + "." + b_result.group(3) + "f40_" + b_result.group(4) + ".SCF"
                    
                    if PM_name in pms:
                        
                        output = output +  PM_name + "    "
                    else:
                        print "no " + PM_name + "\t"  + result.group(1)
        print output
        #print output.rstrip()
    

    copy | embed

    0 comments - tagged in  posted by dgg32 on Feb 04, 2010 at 12:44 p.m. EST
  • match my PM list to Hanno's TETRA results
    import sys
    import re
    from sets import Set
    
    f = open ('/Users/user/Downloads/PM-1.txt')
    
    pm_match = re.compile (r'\d+_(\S+)')
    pms = set()
    unique = set()
    
    for line in f.readlines(): 
         pm_result = pm_match.match (line.rstrip())
         
         if pm_result:
            pms.add(pm_result.group(1))
           
            
            
    f.close()
    
    #print pms
    
    matchobj = re.compile (r'(\S+):\s+\d+\.\d+')
    
    f_match = re.compile (r'f_(\d+)[a-z]+_\w+')
    
    a_match = re.compile (r'a4_([0-9a-z]+)_\w+')
    
    for line in sys.stdin:
        names = line.split("\t")
    
        output = ""
        for name in names:
            result = matchobj.match (name)
            
            if result:
    
                #print result.group(1)
                f_result = f_match.match (result.group(1))
                if f_result:
                    PM_name = "fc1f" + f_result.group(1)
                    
                    if PM_name in pms:
                        unique.add(PM_name)          
                
                
                a_result = a_match.match (result.group(1))
                if a_result:
                    PM_name = "anke4-" + a_result.group(1)
                    
                    if PM_name in pms:
                        unique.add(PM_name)
    
    print len(unique)
        #print output.rstrip()
    

    copy | embed

    0 comments - tagged in  posted by dgg32 on Feb 04, 2010 at 10:28 a.m. EST
  • k means from G-Do
    # clustering.py contains classes and functions that cluster data points
    import sys, math, random
    # -- The Point class represents points in n-dimensional space
    class Point:
        # Instance variables
        # self.coords is a list of coordinates for this Point
        # self.n is the number of dimensions this Point lives in (ie, its space)
        # self.reference is an object bound to this Point
        # Initialize new Points
        def __init__(self, coords, reference=None):
            self.coords = coords
            self.n = len(coords)
            self.reference = reference
        # Return a string representation of this Point
        def __repr__(self):
            return str(self.coords)
    # -- The Cluster class represents clusters of points in n-dimensional space
    class Cluster:
        # Instance variables
        # self.points is a list of Points associated with this Cluster
        # self.n is the number of dimensions this Cluster's Points live in
        # self.centroid is the sample mean Point of this Cluster
        def __init__(self, points):
            # We forbid empty Clusters (they don't make mathematical sense!)
            if len(points) == 0: raise Exception("ILLEGAL: EMPTY CLUSTER")
            self.points = points
            self.n = points[0].n
            # We also forbid Clusters containing Points in different spaces
            # Ie, no Clusters with 2D Points and 3D Points
            for p in points:
                if p.n != self.n: raise Exception("ILLEGAL: MULTISPACE CLUSTER")
            # Figure out what the centroid of this Cluster should be
            self.centroid = self.calculateCentroid()
        # Return a string representation of this Cluster
        def __repr__(self):
            return str(self.points)
        # Update function for the K-means algorithm
        # Assigns a new list of Points to this Cluster, returns centroid difference
        def update(self, points):
            old_centroid = self.centroid
            self.points = points
            self.centroid = self.calculateCentroid()
            return getDistance(old_centroid, self.centroid)
        # Calculates the centroid Point - the centroid is the sample mean Point
        # (in plain English, the average of all the Points in the Cluster)
        def calculateCentroid(self):
            centroid_coords = []
            # For each coordinate:
            for i in range(self.n):
                # Take the average across all Points
                centroid_coords.append(0.0)
                for p in self.points:
                    centroid_coords[i] = centroid_coords[i]+p.coords[i]
                centroid_coords[i] = centroid_coords[i]/len(self.points)
            # Return a Point object using the average coordinates
            return Point(centroid_coords)
    # -- Return Clusters of Points formed by K-means clustering
    def kmeans(points, k, cutoff):
        # Randomly sample k Points from the points list, build Clusters around them
        initial = random.sample(points, k)
        clusters = []
        for p in initial: clusters.append(Cluster([p]))
        # Enter the program loop
        while True:
            # Make a list for each Cluster
            lists = []
            for c in clusters: lists.append([])
            # For each Point:
            for p in points:
                # Figure out which Cluster's centroid is the nearest
                smallest_distance = getDistance(p, clusters[0].centroid)
                index = 0
                for i in range(len(clusters[1:])):
                    distance = getDistance(p, clusters[i+1].centroid)
                    if distance < smallest_distance:
                        smallest_distance = distance
                        index = i+1
                # Add this Point to that Cluster's corresponding list
                lists[index].append(p)
            # Update each Cluster with the corresponding list
            # Record the biggest centroid shift for any Cluster
            biggest_shift = 0.0
            for i in range(len(clusters)):
                shift = clusters[i].update(lists[i])
                biggest_shift = max(biggest_shift, shift)
            # If the biggest centroid shift is less than the cutoff, stop
            if biggest_shift < cutoff: break
        # Return the list of Clusters
        return clusters
    # -- Get the Euclidean distance between two Points
    def getDistance(a, b):
        # Forbid measurements between Points in different spaces
        if a.n != b.n: raise Exception("ILLEGAL: NON-COMPARABLE POINTS")
        # Euclidean distance between a and b is sqrt(sum((a[i]-b[i])^2) for all i)
        ret = 0.0
        for i in range(a.n):
            ret = ret+pow((a.coords[i]-b.coords[i]), 2)
        return math.sqrt(ret)
    # -- Create a random Point in n-dimensional space
    def makeRandomPoint(n, lower, upper):
        coords = []
        for i in range(n): coords.append(random.uniform(lower, upper))
        return Point(coords)
    # -- Main function
    def main(args):
        num_points, n, k, cutoff, lower, upper = 10, 2, 3, 0.5, -200, 200
        # Create num_points random Points in n-dimensional space
        points = []
        for i in range(num_points): points.append(makeRandomPoint(n, lower, upper))
        # Cluster the points using the K-means algorithm
        clusters = kmeans(points, k, cutoff)
        # Print the results
        print "\nPOINTS:"
        for p in points: print "P:", p
        print "\nCLUSTERS:"
        for c in clusters: print "C:", c
    # -- The following code executes upon command-line invocation
    if __name__ == "__main__": main(sys.argv)
    

    copy | embed

    0 comments - tagged in  posted by dgg32 on Jan 27, 2010 at 8:51 p.m. EST
  • hmmpfam_parser, all pfam results get imported
    import sys
    import string
    import MySQLdb
    import re
    from collections import defaultdict
    
    
    hostname = sys.argv[1]
    dbname = sys.argv[2]
    username = sys.argv[3]
    password = sys.argv[4]
    
    conn = MySQLdb.connect (host = hostname,
                               user = username,
                               passwd = password,
                               db = dbname)
    
    cursor = conn.cursor ()
    
    
    
    
    
    match_Sequence = re.compile(r'^Query sequence: +(\d+)$')
    
    match_Description = re.compile (r'^(\S+) +(.+) +(\d+\.\d+) +(\S+) +\d+$')
    
    match_domains = re.compile (r'^(\S+) +\S+ +(\d+) +(\d+) +\S+ +(\d+) +(\d+) +\S+ +\S+ +\S+$')
    
    primary_result_set = defaultdict(dict)
    final_result_set = defaultdict(dict)
    
    region_id = "null"
    result_number = 0
    
    for line in sys.stdin:
        
        region_result = match_Sequence.match (line)
        
        if region_result: 
            
            if region_id != "null":
                #print result_number
                for i in range (result_number - 1):
                    command1="INSERT INTO `Observation`(`_id`, `_mtime`, `_ctime`, `target_region`, `deprecated`, `tool_id`, `stop`, `start`, `description`, `region_id`, `_obj_class_id`) VALUES (NULL, NOW(), NOW(), NULL, NULL, '6', '" + final_result_set[i]["stop"] + "', '" + final_result_set[i]["start"] + "', '" + final_result_set[i]["description"] + "', '" + region_id + "', '23');"
    	            #print command1
                    cursor.execute (command1)
    	
    	            cursor.execute ("SELECT LAST_INSERT_ID() FROM Observation")
    	            result_set = cursor.fetchall ()
    	            for row in result_set:
    	                insert_id = row[0]
                #print insert_id
    
                        command2 = "INSERT INTO `Observation_Function_HMMPfam` (`_parent_id`, `domain_stop`, `domain_start`, `model_acc`, `model_name`, `evalue`, `score`) VALUES ('" + str(insert_id) + "', '" + final_result_set[i]["domain_stop"] + "', '" + final_result_set[i]["domain_start"] + "', '"+ final_result_set[i]["model_acc"] + "', '" + final_result_set[i]["model_name"] + "', '" + final_result_set[i]["evalue"] + "', '" + final_result_set[i]["score"] + "');"
                        cursor.execute (command2)
    	        
            primary_result_set = defaultdict(dict)
            final_result_set = defaultdict(dict)
    
            region_id = region_result.group(1)
    
        description_result = match_Description.match(line)
        if description_result:
            model_acc = description_result.group(1)
            primary_result_set[model_acc]["model_acc"] = description_result.group(1)
            primary_result_set[model_acc]["model_name"] = description_result.group(1)
            primary_result_set[model_acc]["description"] = description_result.group(2)
            primary_result_set[model_acc]["score"] = description_result.group(3)
            primary_result_set[model_acc]["evalue"] = description_result.group(4)
            #print result_set[result_number]["evalue"]
            new_region = 0
            
        domain_result = match_domains.match(line)
        if domain_result:
            if new_region == 0:
                new_region = 1
                result_number = 0
            
            model_acc = domain_result.group(1)
            
            final_result_set[result_number]["start"]= domain_result.group(2)
            final_result_set[result_number]["stop"] = domain_result.group(3)
            final_result_set[result_number]["domain_start"] = domain_result.group(4)
            final_result_set[result_number]["domain_stop"] = domain_result.group(5)
            final_result_set[result_number]["model_acc"] = model_acc
            final_result_set[result_number]["model_name"] = model_acc
            final_result_set[result_number]["description"] = primary_result_set[model_acc]["descritpion"]
            final_result_set[result_number]["score"] = primary_result_set[model_acc]["score"]
            final_result_set[model_acc]["evalue"] = primary_result_set[model_acc]["evalue"]
            
            result_number = result_number + 1
            #print result_set[result_number]["domain_start"]
    
     for i in range (result_number - 1):
        command1="INSERT INTO `Observation`(`_id`, `_mtime`, `_ctime`, `target_region`, `deprecated`, `tool_id`, `stop`, `start`, `description`, `region_id`, `_obj_class_id`) VALUES (NULL, NOW(), NOW(), NULL, NULL, '6', '" + final_result_set[i]["stop"] + "', '" + final_result_set[i]["start"] + "', '" + final_result_set[i]["description"] + "', '" + region_id + "', '23');"
    	            #print command1
        cursor.execute (command1)
    	
        cursor.execute ("SELECT LAST_INSERT_ID() FROM Observation")
    	result_set = cursor.fetchall ()
    	for row in result_set:
    	    insert_id = row[0]
            #print insert_id
    
            command2 = "INSERT INTO `Observation_Function_HMMPfam` (`_parent_id`, `domain_stop`, `domain_start`, `model_acc`, `model_name`, `evalue`, `score`) VALUES ('" + str(insert_id) + "', '" + final_result_set[i]["domain_stop"] + "', '" + final_result_set[i]["domain_start"] + "', '"+ final_result_set[i]["model_acc"] + "', '" + final_result_set[i]["model_name"] + "', '" + final_result_set[i]["evalue"] + "', '" + final_result_set[i]["score"] + "');"
            cursor.execute (command2)
    
    
    cursor.close ()
    conn.commit ()
    conn.close ()
    

    copy | embed

    0 comments - tagged in  posted by dgg32 on Jan 14, 2010 at 2:37 p.m. EST
  • calculate the gc of certain contigs in GenDB
    import sys
    import MySQLdb
    
    
    hostname = sys.argv[1]
    dbname = sys.argv[2]
    username = sys.argv[3]
    password = sys.argv[4]
    
    
    
    conn = MySQLdb.connect (host = hostname,
                               user = username,
                               passwd = password,
                               db = dbname)
    
    cursor = conn.cursor ()
    
    
    for line in sys.stdin:
        if (len(line) > 1):
        	command = "SELECT sequence FROM Sequence, Region_Source WHERE Region_Source._parent_id = " + line + " AND Sequence._id = Region_Source.real_sequence_id"
    #	print command
        	cursor.execute (command)
        	result_set = cursor.fetchall()
        	for row in result_set:
    #	    print row[0]
                gc = row[0].count('c') + row[0].count ('C') + row[0].count ('g') + row[0].count('G')
    #	    print gc
    #	    print len(row[0])
    #	    print gc/float(len(row[0]))
                print line.rstrip() + ": " + str(gc/float(len(row[0])) * 100)
    
    cursor.close ()
    conn.commit ()
    conn.close ()
    

    copy | embed

    0 comments - tagged in  posted by dgg32 on Jan 12, 2010 at 6:13 a.m. EST
  • convert decimal int to hex
    import sys
    
    for line in sys.stdin:
        print  "%X" % int(line)
    

    copy | embed

    0 comments - tagged in  posted by dgg32 on Jan 10, 2010 at 6:47 a.m. EST
  • re-establish the mates relation
    import sys
    import MySQLdb
    
    
    hostname = sys.argv[1]
    dbname = sys.argv[2]
    username = sys.argv[3]
    password = sys.argv[4]
    
    
    
    conn = MySQLdb.connect (host = hostname,
                               user = username,
                               passwd = password,
                               db = dbname)
    
    cursor = conn.cursor ()
    
    
    
    Seqs = {}
    
    
    for line in sys.stdin:
        fields = line.rstrip().split("_")
        Seqs[fields[0]] = fields[1] + "_" + fields[2]
    
    for region in Seqs.keys():
    
        for target in Seqs.keys():
            if Seqs[region] == Seqs[target] and region != target:
    	    #print region + ":" + target
    	    command1="INSERT INTO `Silmaril_Mates`(`_parent_region_id`, `region_id`) VALUES ('" + region + "', '" + target + "');"
    	    #print command1
                cursor.execute (command1)
    
    cursor.close ()
    conn.commit ()
    conn.close ()
    

    copy | embed

    0 comments - tagged in  posted by dgg32 on Jan 04, 2010 at 9:16 a.m. EST
  • filter ANME from ANME1_Endseq
    import sys
    
    ANME = set()
    for line in sys.stdin:
    	ANME.add(line.rstrip())
    
    	
    
    input = sys.argv[1]
    f = open (input, 'r')
    printIt = False
    
    
    for line in f.readlines():
    
       if line.find ('>') != -1 and line[1:].rstrip() not in ANME:
    	print line.rstrip()
    	printIt = True
    
       if line.find ('>') == -1 and printIt == True:
    	print line
    	printIt = False
    

    copy | embed

    0 comments - tagged in  posted by dgg32 on Jan 04, 2010 at 5:45 a.m. EST
  • filter the SSAHA2 result
    #to filter the SSAHA2 result like
    #===================================================
    #Matches For Query 20489 (913 bases): b201.1e11.f.b1
    #===================================================
    #Score     Q_Name             S_Name            Q_Start    Q_End  #S_Start    S_End Direction #Bases identity
    #ALIGNMENT::00 730   b201.1e11.f.b1 SC02       15      816    288648    #289465   F     802 97.51 913
    #ALIGNMENT::00 730   b201.1e11.f.b1 SC05       15      816     34214     #35031   F     802 97.51 913
    
    cat '/megx/home/shuang/result/ANME_vs_ANME1_SSAHA2/ANME_vs_ANME1_Endseq.txt' | egrep "ALIGNMENT" | sed 's/::/:/g' | sed 's/ \+/:/g' | cut -d: -f4,12
    

    copy | embed

    0 comments - tagged in  posted by dgg32 on Jan 04, 2010 at 5:10 a.m. EST
  • count how many pdf or chm files are there in endnote
    ls -R -l /Users/sixinghuang/Documents/My\ EndNote\ Library.Data | egrep "\.pdf|\.chm" | wc -l
    

    copy | embed

    0 comments - tagged in  posted by dgg32 on Jan 02, 2010 at 9:17 p.m. EST
  • pfam24 parser + import to GenDB
    import sys
    import string
    import MySQLdb
    import re
    
    
    hostname = sys.argv[1]
    dbname = sys.argv[2]
    username = sys.argv[3]
    password = sys.argv[4]
    
    
    
    conn = MySQLdb.connect (host = hostname,
                               user = username,
                               passwd = password,
                               db = dbname)
    
    cursor = conn.cursor ()
    matchobj = re.compile(r'^(\w+)\ +\S+\ +\d+\ +(\d+)\ +-\ +\d+\ +(\S+)\ +(\S+)\ +\S+\ +\d+\ +\d+\ +\S+\ +\S+\ +\S+\ +\S+\ +(\d+)\ +(\d+)\ +(\d+)\ +(\d+)\ +\d+\ +\d+\ +\S+\ +(.+)$')
    
    for line in sys.stdin:
        
        result = matchobj.match (line)
        if result : 
            
    	command1="INSERT INTO `Observation`(`_id`, `_mtime`, `_ctime`, `target_region`, `deprecated`, `tool_id`, `stop`, `start`, `description`, `region_id`, `_obj_class_id`) VALUES (NULL, NOW(), NOW(), NULL, NULL, '6', '" + result.group(8) + "', '" + result.group(7) + "', '" + result.group(9) + "', '" + result.group(2) + "', '23');"
    	#print command1
            cursor.execute (command1)
    	
    	cursor.execute ("SELECT LAST_INSERT_ID() FROM Observation")
    	result_set = cursor.fetchall ()
    	for row in result_set:
    	    insert_id = row[0]
            #print insert_id
    
            command2 = "INSERT INTO `Observation_Function_HMMPfam` (`_parent_id`, `domain_stop`, `domain_start`, `model_acc`, `model_name`, `evalue`, `score`) VALUES ('" + str(insert_id) + "', '" + result.group(5) + "', '" + result.group(4) + "', '"+ result.group(0) + "', '" + result.group(0) + "', '" + result.group(2) + "', '" + result.group(3) + "');"
            cursor.execute (command2)
            #print command2
    
    cursor.close ()
    conn.commit ()
    conn.close ()
    

    copy | embed

    0 comments - tagged in  posted by dgg32 on Jan 02, 2010 at 3:54 p.m. EST
Sign up to create your own snipts, or login.