Public snipts »
dgg32's
snipts
showing 1-18 of 18 snipts
-
∞ calculate p value to determine if null hypothesis is true
#!/usr/bin/env python import cmath import sys def normsdist (x, mean, standard_dev): res = 0 x = (x -mean) / standard_dev if x == 0: res = 0.5 else: oor2pi = 1 /(cmath.sqrt(2 * 3.14159265358979323846)) t = 1 / (1 + 0.2316419 * abs(x)) t = t * oor2pi * cmath.exp(-0.5 * x * x) * (0.31938153 + t * (-0.356563782 + t * (1.781477937 + t * (-1.821255978 + t * 1.330274429)))) if x >= 0: res = 1 - t else: res = t return res def calcP(Untergewicht_Nichtraucherin, Untergewicht_Raucherin, nicht_Geburten_Nichtraucherin, nicht_Geburten_Raucherin): Total_Nichtraucherin = Untergewicht_Nichtraucherin+ nicht_Geburten_Nichtraucherin Total_Raucherin = Untergewicht_Raucherin + nicht_Geburten_Raucherin Quote_Nichtraucherin = Untergewicht_Nichtraucherin/Total_Nichtraucherin Quote_Raucherin = Untergewicht_Raucherin/Total_Raucherin Summe_Untergewicht = Untergewicht_Nichtraucherin + Untergewicht_Raucherin Summe_nicht_Geburten = nicht_Geburten_Nichtraucherin + nicht_Geburten_Raucherin Summe_Total = Summe_Untergewicht + Summe_nicht_Geburten Quote_Summe = Summe_Untergewicht/Summe_Total Testwert = (Untergewicht_Nichtraucherin*nicht_Geburten_Raucherin-nicht_Geburten_Nichtraucherin*Untergewicht_Raucherin)/cmath.sqrt(Summe_Untergewicht*Summe_nicht_Geburten*Total_Nichtraucherin*Total_Raucherin/Summe_Total) P = 2*normsdist(-abs(Testwert), 0, 1) return P Untergewicht_Nichtraucherin = int(sys.argv[1]) Untergewicht_Raucherin = int(sys.argv[2]) nicht_Geburten_Nichtraucherin = int(sys.argv[3]) nicht_Geburten_Raucherin = int(sys.argv[4]) P = calcP(Untergewicht_Nichtraucherin, Untergewicht_Raucherin, nicht_Geburten_Nichtraucherin, nicht_Geburten_Raucherin) print P if abs(P) < 0.05: print "alternative hypo, p1 != p2" else: print "null hypo, p1 == p2"
-
∞ normsdist, returns the probability that the observed value of a standard normal random variable will be less than or equal to z. A standard normal random variable has mean 0 and standard deviation 1 (and also variance 1 because variance = standard deviation squared).
#!/usr/bin/env python import cmath import sys def normsdist (x, mean, standard_dev): res = 0 x = (x -mean) / standard_dev if x == 0: res = 0.5 else: oor2pi = 1 /(cmath.sqrt(2 * 3.14159265358979323846)) t = 1 / (1 + 0.2316419 * abs(x)) t = t * oor2pi * cmath.exp(-0.5 * x * x) * (0.31938153 + t * (-0.356563782 + t * (1.781477937 + t * (-1.821255978 + t * 1.330274429)))) if x >= 0: res = 1 - t else: res = t return res x = sys.argv[1] print normsdist (float(x), 0, 1)
-
∞ Translate_EMBL
#include <QString> #include <QHash> #include <iostream> #include <QTextStream> void translateThis (QString DNA, int offset, QHash <QString, QString> CodonTable); int main (int agrc, char** argv) { QHash <QString, QString> CodonTable; CodonTable.insert ("TTT", "F"); CodonTable.insert ("TTC", "F"); CodonTable.insert ("TTA", "L"); CodonTable.insert ("TTG", "L"); CodonTable.insert ("CTT", "L"); CodonTable.insert ("CTC", "L"); CodonTable.insert ("CTA", "L"); CodonTable.insert ("CTG", "L"); CodonTable.insert ("ATT", "I"); CodonTable.insert ("ATC", "I"); CodonTable.insert ("ATA", "I"); CodonTable.insert ("ATG", "M"); CodonTable.insert ("GTT", "V"); CodonTable.insert ("GTC", "V"); CodonTable.insert ("GTA", "V"); CodonTable.insert ("GTG", "V"); CodonTable.insert ("TCT", "S"); CodonTable.insert ("TCC", "S"); CodonTable.insert ("TCA", "S"); CodonTable.insert ("TCG", "S"); CodonTable.insert ("CCT", "P"); CodonTable.insert ("CCC", "P"); CodonTable.insert ("CCA", "P"); CodonTable.insert ("CCG", "P"); CodonTable.insert ("ACT", "T"); CodonTable.insert ("ACC", "T"); CodonTable.insert ("ACA", "T"); CodonTable.insert ("ACG", "T"); CodonTable.insert ("GCT", "A"); CodonTable.insert ("GCC", "A"); CodonTable.insert ("GCA", "A"); CodonTable.insert ("GCG", "A"); CodonTable.insert ("TAT", "Y"); CodonTable.insert ("TAC", "Y"); CodonTable.insert ("TAA", "*"); CodonTable.insert ("TAG", "*"); CodonTable.insert ("CAT", "H"); CodonTable.insert ("CAC", "H"); CodonTable.insert ("CAA", "Q"); CodonTable.insert ("CAG", "Q"); CodonTable.insert ("AAT", "N"); CodonTable.insert ("AAC", "N"); CodonTable.insert ("AAA", "K"); CodonTable.insert ("AAG", "K"); CodonTable.insert ("GAT", "D"); CodonTable.insert ("GAC", "D"); CodonTable.insert ("GAA", "E"); CodonTable.insert ("GAG", "E"); CodonTable.insert ("TGT", "C"); CodonTable.insert ("TGC", "C"); CodonTable.insert ("TGA", "*"); CodonTable.insert ("TGG", "W"); CodonTable.insert ("CGT", "R"); CodonTable.insert ("CGC", "R"); CodonTable.insert ("CGA", "R"); CodonTable.insert ("CGG", "R"); CodonTable.insert ("AGT", "S"); CodonTable.insert ("AGC", "S"); CodonTable.insert ("AGA", "R"); CodonTable.insert ("AGG", "R"); CodonTable.insert ("GGT", "G"); CodonTable.insert ("GGC", "G"); CodonTable.insert ("GGA", "G"); CodonTable.insert ("GGG", "G"); QTextStream in (stdin); while (!in.atEnd()) { QString Line = in.readLine(); if (!Line.contains(">")) { translateThis (Line, 0, CodonTable); } } } void translateThis (QString DNA, int offset, QHash <QString, QString> CodonTable) { QString Protein; for (int i = offset-1; i != DNA.length(); i += 3) { QString Aminoacid = CodonTable[DNA.mid(i, 3)]; if (Aminoacid != "*") { Protein += Aminoacid; } else { break; } } }
-
∞ update cdna counts in region_read
#!/usr/bin/env python import sys import string import MySQLdb import re hostname = sys.argv[1] dbname = sys.argv[2] username = sys.argv[3] password = sys.argv[4] conn = MySQLdb.connect (host = hostname, user = username, passwd = password, db = dbname) cursor = conn.cursor () getRead = "SELECT mapped_start, mapped_stop, parent_region_id FROM Silmaril_SSAHA2_cDNA_Mapping" cursor.execute (getRead) resultset = cursor.fetchall() region_reads = {} for result in resultset: contig_id = result[2] start = result[0] stop = result[1] getRegionCDS = "SELECT stop, start, _id FROM Region WHERE parent_region_id = '" + str(contig_id) + "' OR _id = '" + str(contig_id) + "';" cursor.execute (getRegionCDS) regions = cursor.fetchall() for region in regions: region_start = region [1] region_stop = region [0] region_id = region[2] if (max (start, stop) >= min(region_start, region_stop) and max (start, stop) <= max (region_start, region_stop)) or (min (start, stop) >= min (region_start, region_stop) and min(start, stop) <= max (region_start, region_stop)) or (min (start, stop) <= min(region_start, region_stop) and max (start, stop) >= max(region_start, region_stop)): if region_id in region_reads: region_reads [region_id] = region_reads [region_id] + 1 else: region_reads [region_id] = 1 for key in region_reads.keys(): insert = "UPDATE `Region_Read` SET cdna = '" + str(region_reads[key]) + "' WHERE region_id = '" + str(key) + "';" #print insert cursor.execute (insert)
-
∞ count the read amount of contigs and regions
#!/usr/bin/env python import sys import string import MySQLdb import re hostname = sys.argv[1] dbname = sys.argv[2] username = sys.argv[3] password = sys.argv[4] conn = MySQLdb.connect (host = hostname, user = username, passwd = password, db = dbname) cursor = conn.cursor () getRead = "SELECT mapped_start, mapped_stop, region_id FROM Ace2mapable" cursor.execute (getRead) resultset = cursor.fetchall() region_reads = {} for result in resultset: contig_id = result[2] start = result[0] stop = result[1] getRegionCDS = "SELECT stop, start, _id FROM Region WHERE parent_region_id = '" + str(contig_id) + "' OR _id = '" + str(contig_id) + "';" cursor.execute (getRegionCDS) regions = cursor.fetchall() for region in regions: region_start = region [1] region_stop = region [0] region_id = region[2] if (max (region_start, region_stop) >= min (start, stop) and min (region_start, region_stop) <= min (start, stop)) or (max(region_start, region_stop) >= max (start, stop) and min(region_start, region_stop) <= max (start, stop)) or (min (region_start, region_stop) <= min (start, stop) and max (region_start, region_stop) >= max (start, stop)): if region_id in region_reads: region_reads [region_id] = region_reads [region_id] + 1 else: region_reads [region_id] = 1 for key in region_reads.keys(): insert = "INSERT INTO `Region_Read`(`region_id`, `read_count`) VALUES ('" + str(key) + "', '" + str(region_reads[key]) + "';" cursor.execute (insert)
-
∞ detect duplicate names in fasta and give a suggestion
import sys from sets import Set from collections import defaultdict import random def get_optimal (start, length, reverse_Collision): optimal = list () counter = 0 stop = 0 for i in sorted(reverse_Collision): if stop == 1: break for j in sorted(reverse_Collision[i]): if counter >= start: optimal.append (j) #print reverse_Collision[i][j] counter = counter + 1 if (counter-start) == Length: stop = 1 break return optimal Trial = 2 Length = 7 Collision = dict() Collection = [] allNames = list() for line in sys.stdin: allNames.append (line.rstrip()) for i in range(len(line.rstrip())): if len(Collection) == i: #print "first if " + line [i] Collection.append(Set()) Collection[i].add(line[i]) Collision[i] = 0 else: #print "first else " + line [i] if line[i] in Collection[i]: Collision[i] = Collision[i] + 1 else: Collection[i].add (line[i]) reverse_Collision = defaultdict(list) for key in Collision: #print key reverse_Collision[Collision[key]].append(key) #print Collision[key] #print reverse_Collision fail = 2 start = 0 taken = Set() optimal = list() while fail != 0: optimal = get_optimal (start, Length, reverse_Collision) times = 0 if (len(taken) + 1) >= len(allNames): fail = 0 #print "len(allNames) - 1 " + str(len(allNames) - 1) if fail != 0: index = random.randint(0, len(allNames) - 1) while index in taken: index = random.randint(0, len(allNames) - 1) taken.add (index) sample = "" for i in range(len(optimal)): sample = sample + allNames[index][optimal[i]] #print sample for i in range (len(allNames)): if fail == 1: output = "" for k in range(len(optimal)): output = output + " " + str(optimal[k]) print "Positions" + output + " fail." break if i not in taken: times = times + 1 target = "" for j in range(len(optimal)): target = target + allNames[i][optimal[j]] if sample == target: print "Collision found" fail = 1 start = start + 1 break print times if times == Trial: fail = 0 print "Try " + str(times) + " times. No collision found!" output = "" if fail == 0: for k in range(len(optimal)): output = output + " " + str(optimal[k]) print "I suggest positions " + output + "." #print i #print sorted_Collision_values
-
∞ correct the names in ANME1Endseqs GenDB project
import sys import re from sets import Set import MySQLdb hostname = sys.argv[1] dbname = sys.argv[2] username = sys.argv[3] password = sys.argv[4] conn = MySQLdb.connect (host = hostname, user = username, passwd = password, db = dbname) cursor = conn.cursor () f = open ('/megx/home/shuang/sequences/BSMallEndseqForHanno.fas') pms = set() for line in f.readlines(): if line.startswith(">"): result = line.rstrip().split (" ") #print result for pm_result in result: pms.add(pm_result[1:]) f.close() #print pms f_match = re.compile (r'f_(\d+)([a-z]+)_(\w+)') a_match_2 = re.compile (r'a4_([0-9a-z]+)_m_([a-z0-9]+)') a_match = re.compile (r'a4_([0-9a-z]+)_([0-9a-ln-z]+)([_a-z0-9]*)') a5_match = re.compile (r'a5_([0-9a-z]+)_([a-z]+)_([a-z0-9]+)') b_match = re.compile (r'b(\d+)_([0-9a-z]+)_([rf])_(b\d+)') command1="SELECT description from Sequence" #print command1 cursor.execute (command1) resultset = cursor.fetchall () for name in resultset: output = "" #print result.group(1) f_result = f_match.match (name[0]) if f_result: PM_name = "fc1f" + f_result.group(1) + "-es_" + f_result.group(2) + "7." + f_result.group(3) PM_name_1= "fc1f" + f_result.group(1) + "-es_" + f_result.group(2) + "1328." + f_result.group(3) if PM_name in pms: output = PM_name elif PM_name_1 in pms: output = PM_name_1 else: print "no " + PM_name + "\t" + name[0] a_result_2 = a_match_2.match (name[0]) if a_result_2: PM_name = "anke4-" + a_result_2.group(1) + "_f40.m1328." + a_result_2.group(2) if PM_name in pms: output = PM_name else: print "no " + PM_name + "\t" + name[0] #print PM_name a_result = a_match.match (name[0]) if a_result: PM_name = "anke4-" + a_result.group(1) + "_f40." + a_result.group(2).replace ("_", ".") if len(a_result.groups()) == 3: PM_name = PM_name + a_result.group(3).replace ("_", ".") if PM_name in pms: output = PM_name else: print "no " + PM_name + "\t" + name[0] + " a_result.groups(): " + str(len(a_result.groups())) #print PM_name a5_result = a5_match.match (name[0]) if a5_result: PM_name = "anke5" + a5_result.group(1) + "-es_" + a5_result.group(2) + "1328." + a5_result.group(3) PM_name_1= "anke5" + a5_result.group(1) + "-es_" + a5_result.group(2) + "7." + a5_result.group(3) if PM_name in pms: output = PM_name elif PM_name_1 in pms: output = PM_name_1 else: print "no " + PM_name + "\t" + name[0] b_result = b_match.match (name[0]) if b_result: PM_name = "fcb" + b_result.group(1) + "-" + b_result.group(2) + "." + b_result.group(3) + "f40_" + b_result.group(4) + ".SCF" if PM_name in pms: output = PM_name else: print "no " + PM_name + "\t" + name[0] command2 = "Update Sequence set name = '" + output + "' where description = '" + name[0] + "';" #print command2 cursor.execute(command2) print "finished!" #print output #print output.rstrip()
-
∞ Correct the names from ANME1_Endseqs GenDB project according to original fasta file
import sys import re from sets import Set f = open ('/Users/user/Downloads/BSMallEndseqForHanno.fas') pms = set() for line in f.readlines(): if line.startswith(">"): result = line.rstrip().split (" ") #print result for pm_result in result: pms.add(pm_result[1:]) f.close() #print pms matchobj = re.compile (r'(\S+):\s+\d+\.\d+') f_match = re.compile (r'f_(\d+)([a-z]+)_(\w+)') a_match = re.compile (r'a4_([0-9a-z]+)_(\w+)') a5_match = re.compile (r'a5_([0-9a-z]+)_([a-z]+)_([a-z0-9]+)') b_match = re.compile (r'b(\d+)_([0-9a-z]+)_([rf])_(b\d+)') for line in sys.stdin: names = line.split("\t") output = "" for name in names: result = matchobj.match (name) if result: #print result.group(1) f_result = f_match.match (result.group(1)) if f_result: PM_name = "fc1f" + f_result.group(1) + "-es_" + f_result.group(2) + "7." + f_result.group(3) PM_name_1= "fc1f" + f_result.group(1) + "-es_" + f_result.group(2) + "1328." + f_result.group(3) if PM_name in pms: output = output + PM_name + " " elif PM_name_1 in pms: output = output + PM_name_1 + " " else: print "no " + PM_name + "\t" + result.group(1) a_result = a_match.match (result.group(1)) if a_result: PM_name = "anke4-" + a_result.group(1) + "_f40." + a_result.group(2).replace ("_", ".") if PM_name in pms: output = output + PM_name + " " else: print "no " + PM_name + "\t" + result.group(1) #print PM_name a5_result = a5_match.match (result.group(1)) if a5_result: PM_name = "anke5" + a5_result.group(1) + "-es_" + a5_result.group(2) + "1328." + a5_result.group(3) PM_name_1= "anke5" + a5_result.group(1) + "-es_" + a5_result.group(2) + "7." + a5_result.group(3) if PM_name in pms: output = output + PM_name + " " elif PM_name_1 in pms: output = output + PM_name_1 + " " else: print "no " + PM_name + "\t" + result.group(1) b_result = b_match.match (result.group(1)) if b_result: PM_name = "fcb" + b_result.group(1) + "-" + b_result.group(2) + "." + b_result.group(3) + "f40_" + b_result.group(4) + ".SCF" if PM_name in pms: output = output + PM_name + " " else: print "no " + PM_name + "\t" + result.group(1) print output #print output.rstrip()
-
∞ match my PM list to Hanno's TETRA results
import sys import re from sets import Set f = open ('/Users/user/Downloads/PM-1.txt') pm_match = re.compile (r'\d+_(\S+)') pms = set() unique = set() for line in f.readlines(): pm_result = pm_match.match (line.rstrip()) if pm_result: pms.add(pm_result.group(1)) f.close() #print pms matchobj = re.compile (r'(\S+):\s+\d+\.\d+') f_match = re.compile (r'f_(\d+)[a-z]+_\w+') a_match = re.compile (r'a4_([0-9a-z]+)_\w+') for line in sys.stdin: names = line.split("\t") output = "" for name in names: result = matchobj.match (name) if result: #print result.group(1) f_result = f_match.match (result.group(1)) if f_result: PM_name = "fc1f" + f_result.group(1) if PM_name in pms: unique.add(PM_name) a_result = a_match.match (result.group(1)) if a_result: PM_name = "anke4-" + a_result.group(1) if PM_name in pms: unique.add(PM_name) print len(unique) #print output.rstrip()
-
∞ k means from G-Do
# clustering.py contains classes and functions that cluster data points import sys, math, random # -- The Point class represents points in n-dimensional space class Point: # Instance variables # self.coords is a list of coordinates for this Point # self.n is the number of dimensions this Point lives in (ie, its space) # self.reference is an object bound to this Point # Initialize new Points def __init__(self, coords, reference=None): self.coords = coords self.n = len(coords) self.reference = reference # Return a string representation of this Point def __repr__(self): return str(self.coords) # -- The Cluster class represents clusters of points in n-dimensional space class Cluster: # Instance variables # self.points is a list of Points associated with this Cluster # self.n is the number of dimensions this Cluster's Points live in # self.centroid is the sample mean Point of this Cluster def __init__(self, points): # We forbid empty Clusters (they don't make mathematical sense!) if len(points) == 0: raise Exception("ILLEGAL: EMPTY CLUSTER") self.points = points self.n = points[0].n # We also forbid Clusters containing Points in different spaces # Ie, no Clusters with 2D Points and 3D Points for p in points: if p.n != self.n: raise Exception("ILLEGAL: MULTISPACE CLUSTER") # Figure out what the centroid of this Cluster should be self.centroid = self.calculateCentroid() # Return a string representation of this Cluster def __repr__(self): return str(self.points) # Update function for the K-means algorithm # Assigns a new list of Points to this Cluster, returns centroid difference def update(self, points): old_centroid = self.centroid self.points = points self.centroid = self.calculateCentroid() return getDistance(old_centroid, self.centroid) # Calculates the centroid Point - the centroid is the sample mean Point # (in plain English, the average of all the Points in the Cluster) def calculateCentroid(self): centroid_coords = [] # For each coordinate: for i in range(self.n): # Take the average across all Points centroid_coords.append(0.0) for p in self.points: centroid_coords[i] = centroid_coords[i]+p.coords[i] centroid_coords[i] = centroid_coords[i]/len(self.points) # Return a Point object using the average coordinates return Point(centroid_coords) # -- Return Clusters of Points formed by K-means clustering def kmeans(points, k, cutoff): # Randomly sample k Points from the points list, build Clusters around them initial = random.sample(points, k) clusters = [] for p in initial: clusters.append(Cluster([p])) # Enter the program loop while True: # Make a list for each Cluster lists = [] for c in clusters: lists.append([]) # For each Point: for p in points: # Figure out which Cluster's centroid is the nearest smallest_distance = getDistance(p, clusters[0].centroid) index = 0 for i in range(len(clusters[1:])): distance = getDistance(p, clusters[i+1].centroid) if distance < smallest_distance: smallest_distance = distance index = i+1 # Add this Point to that Cluster's corresponding list lists[index].append(p) # Update each Cluster with the corresponding list # Record the biggest centroid shift for any Cluster biggest_shift = 0.0 for i in range(len(clusters)): shift = clusters[i].update(lists[i]) biggest_shift = max(biggest_shift, shift) # If the biggest centroid shift is less than the cutoff, stop if biggest_shift < cutoff: break # Return the list of Clusters return clusters # -- Get the Euclidean distance between two Points def getDistance(a, b): # Forbid measurements between Points in different spaces if a.n != b.n: raise Exception("ILLEGAL: NON-COMPARABLE POINTS") # Euclidean distance between a and b is sqrt(sum((a[i]-b[i])^2) for all i) ret = 0.0 for i in range(a.n): ret = ret+pow((a.coords[i]-b.coords[i]), 2) return math.sqrt(ret) # -- Create a random Point in n-dimensional space def makeRandomPoint(n, lower, upper): coords = [] for i in range(n): coords.append(random.uniform(lower, upper)) return Point(coords) # -- Main function def main(args): num_points, n, k, cutoff, lower, upper = 10, 2, 3, 0.5, -200, 200 # Create num_points random Points in n-dimensional space points = [] for i in range(num_points): points.append(makeRandomPoint(n, lower, upper)) # Cluster the points using the K-means algorithm clusters = kmeans(points, k, cutoff) # Print the results print "\nPOINTS:" for p in points: print "P:", p print "\nCLUSTERS:" for c in clusters: print "C:", c # -- The following code executes upon command-line invocation if __name__ == "__main__": main(sys.argv)
-
∞ hmmpfam_parser, all pfam results get imported
import sys import string import MySQLdb import re from collections import defaultdict hostname = sys.argv[1] dbname = sys.argv[2] username = sys.argv[3] password = sys.argv[4] conn = MySQLdb.connect (host = hostname, user = username, passwd = password, db = dbname) cursor = conn.cursor () match_Sequence = re.compile(r'^Query sequence: +(\d+)$') match_Description = re.compile (r'^(\S+) +(.+) +(\d+\.\d+) +(\S+) +\d+$') match_domains = re.compile (r'^(\S+) +\S+ +(\d+) +(\d+) +\S+ +(\d+) +(\d+) +\S+ +\S+ +\S+$') primary_result_set = defaultdict(dict) final_result_set = defaultdict(dict) region_id = "null" result_number = 0 for line in sys.stdin: region_result = match_Sequence.match (line) if region_result: if region_id != "null": #print result_number for i in range (result_number - 1): command1="INSERT INTO `Observation`(`_id`, `_mtime`, `_ctime`, `target_region`, `deprecated`, `tool_id`, `stop`, `start`, `description`, `region_id`, `_obj_class_id`) VALUES (NULL, NOW(), NOW(), NULL, NULL, '6', '" + final_result_set[i]["stop"] + "', '" + final_result_set[i]["start"] + "', '" + final_result_set[i]["description"] + "', '" + region_id + "', '23');" #print command1 cursor.execute (command1) cursor.execute ("SELECT LAST_INSERT_ID() FROM Observation") result_set = cursor.fetchall () for row in result_set: insert_id = row[0] #print insert_id command2 = "INSERT INTO `Observation_Function_HMMPfam` (`_parent_id`, `domain_stop`, `domain_start`, `model_acc`, `model_name`, `evalue`, `score`) VALUES ('" + str(insert_id) + "', '" + final_result_set[i]["domain_stop"] + "', '" + final_result_set[i]["domain_start"] + "', '"+ final_result_set[i]["model_acc"] + "', '" + final_result_set[i]["model_name"] + "', '" + final_result_set[i]["evalue"] + "', '" + final_result_set[i]["score"] + "');" cursor.execute (command2) primary_result_set = defaultdict(dict) final_result_set = defaultdict(dict) region_id = region_result.group(1) description_result = match_Description.match(line) if description_result: model_acc = description_result.group(1) primary_result_set[model_acc]["model_acc"] = description_result.group(1) primary_result_set[model_acc]["model_name"] = description_result.group(1) primary_result_set[model_acc]["description"] = description_result.group(2) primary_result_set[model_acc]["score"] = description_result.group(3) primary_result_set[model_acc]["evalue"] = description_result.group(4) #print result_set[result_number]["evalue"] new_region = 0 domain_result = match_domains.match(line) if domain_result: if new_region == 0: new_region = 1 result_number = 0 model_acc = domain_result.group(1) final_result_set[result_number]["start"]= domain_result.group(2) final_result_set[result_number]["stop"] = domain_result.group(3) final_result_set[result_number]["domain_start"] = domain_result.group(4) final_result_set[result_number]["domain_stop"] = domain_result.group(5) final_result_set[result_number]["model_acc"] = model_acc final_result_set[result_number]["model_name"] = model_acc final_result_set[result_number]["description"] = primary_result_set[model_acc]["descritpion"] final_result_set[result_number]["score"] = primary_result_set[model_acc]["score"] final_result_set[model_acc]["evalue"] = primary_result_set[model_acc]["evalue"] result_number = result_number + 1 #print result_set[result_number]["domain_start"] for i in range (result_number - 1): command1="INSERT INTO `Observation`(`_id`, `_mtime`, `_ctime`, `target_region`, `deprecated`, `tool_id`, `stop`, `start`, `description`, `region_id`, `_obj_class_id`) VALUES (NULL, NOW(), NOW(), NULL, NULL, '6', '" + final_result_set[i]["stop"] + "', '" + final_result_set[i]["start"] + "', '" + final_result_set[i]["description"] + "', '" + region_id + "', '23');" #print command1 cursor.execute (command1) cursor.execute ("SELECT LAST_INSERT_ID() FROM Observation") result_set = cursor.fetchall () for row in result_set: insert_id = row[0] #print insert_id command2 = "INSERT INTO `Observation_Function_HMMPfam` (`_parent_id`, `domain_stop`, `domain_start`, `model_acc`, `model_name`, `evalue`, `score`) VALUES ('" + str(insert_id) + "', '" + final_result_set[i]["domain_stop"] + "', '" + final_result_set[i]["domain_start"] + "', '"+ final_result_set[i]["model_acc"] + "', '" + final_result_set[i]["model_name"] + "', '" + final_result_set[i]["evalue"] + "', '" + final_result_set[i]["score"] + "');" cursor.execute (command2) cursor.close () conn.commit () conn.close ()
-
∞ calculate the gc of certain contigs in GenDB
import sys import MySQLdb hostname = sys.argv[1] dbname = sys.argv[2] username = sys.argv[3] password = sys.argv[4] conn = MySQLdb.connect (host = hostname, user = username, passwd = password, db = dbname) cursor = conn.cursor () for line in sys.stdin: if (len(line) > 1): command = "SELECT sequence FROM Sequence, Region_Source WHERE Region_Source._parent_id = " + line + " AND Sequence._id = Region_Source.real_sequence_id" # print command cursor.execute (command) result_set = cursor.fetchall() for row in result_set: # print row[0] gc = row[0].count('c') + row[0].count ('C') + row[0].count ('g') + row[0].count('G') # print gc # print len(row[0]) # print gc/float(len(row[0])) print line.rstrip() + ": " + str(gc/float(len(row[0])) * 100) cursor.close () conn.commit () conn.close ()
-
∞ convert decimal int to hex
import sys for line in sys.stdin: print "%X" % int(line)
-
∞ re-establish the mates relation
import sys import MySQLdb hostname = sys.argv[1] dbname = sys.argv[2] username = sys.argv[3] password = sys.argv[4] conn = MySQLdb.connect (host = hostname, user = username, passwd = password, db = dbname) cursor = conn.cursor () Seqs = {} for line in sys.stdin: fields = line.rstrip().split("_") Seqs[fields[0]] = fields[1] + "_" + fields[2] for region in Seqs.keys(): for target in Seqs.keys(): if Seqs[region] == Seqs[target] and region != target: #print region + ":" + target command1="INSERT INTO `Silmaril_Mates`(`_parent_region_id`, `region_id`) VALUES ('" + region + "', '" + target + "');" #print command1 cursor.execute (command1) cursor.close () conn.commit () conn.close ()
-
∞ filter ANME from ANME1_Endseq
import sys ANME = set() for line in sys.stdin: ANME.add(line.rstrip()) input = sys.argv[1] f = open (input, 'r') printIt = False for line in f.readlines(): if line.find ('>') != -1 and line[1:].rstrip() not in ANME: print line.rstrip() printIt = True if line.find ('>') == -1 and printIt == True: print line printIt = False
-
∞ filter the SSAHA2 result
#to filter the SSAHA2 result like #=================================================== #Matches For Query 20489 (913 bases): b201.1e11.f.b1 #=================================================== #Score Q_Name S_Name Q_Start Q_End #S_Start S_End Direction #Bases identity #ALIGNMENT::00 730 b201.1e11.f.b1 SC02 15 816 288648 #289465 F 802 97.51 913 #ALIGNMENT::00 730 b201.1e11.f.b1 SC05 15 816 34214 #35031 F 802 97.51 913 cat '/megx/home/shuang/result/ANME_vs_ANME1_SSAHA2/ANME_vs_ANME1_Endseq.txt' | egrep "ALIGNMENT" | sed 's/::/:/g' | sed 's/ \+/:/g' | cut -d: -f4,12
-
∞ count how many pdf or chm files are there in endnote
ls -R -l /Users/sixinghuang/Documents/My\ EndNote\ Library.Data | egrep "\.pdf|\.chm" | wc -l
-
∞ pfam24 parser + import to GenDB
import sys import string import MySQLdb import re hostname = sys.argv[1] dbname = sys.argv[2] username = sys.argv[3] password = sys.argv[4] conn = MySQLdb.connect (host = hostname, user = username, passwd = password, db = dbname) cursor = conn.cursor () matchobj = re.compile(r'^(\w+)\ +\S+\ +\d+\ +(\d+)\ +-\ +\d+\ +(\S+)\ +(\S+)\ +\S+\ +\d+\ +\d+\ +\S+\ +\S+\ +\S+\ +\S+\ +(\d+)\ +(\d+)\ +(\d+)\ +(\d+)\ +\d+\ +\d+\ +\S+\ +(.+)$') for line in sys.stdin: result = matchobj.match (line) if result : command1="INSERT INTO `Observation`(`_id`, `_mtime`, `_ctime`, `target_region`, `deprecated`, `tool_id`, `stop`, `start`, `description`, `region_id`, `_obj_class_id`) VALUES (NULL, NOW(), NOW(), NULL, NULL, '6', '" + result.group(8) + "', '" + result.group(7) + "', '" + result.group(9) + "', '" + result.group(2) + "', '23');" #print command1 cursor.execute (command1) cursor.execute ("SELECT LAST_INSERT_ID() FROM Observation") result_set = cursor.fetchall () for row in result_set: insert_id = row[0] #print insert_id command2 = "INSERT INTO `Observation_Function_HMMPfam` (`_parent_id`, `domain_stop`, `domain_start`, `model_acc`, `model_name`, `evalue`, `score`) VALUES ('" + str(insert_id) + "', '" + result.group(5) + "', '" + result.group(4) + "', '"+ result.group(0) + "', '" + result.group(0) + "', '" + result.group(2) + "', '" + result.group(3) + "');" cursor.execute (command2) #print command2 cursor.close () conn.commit () conn.close ()



Python in a Nutshell, Second Edition