forked from openworm/owmeta
-
Notifications
You must be signed in to change notification settings - Fork 0
/
lineage.py
156 lines (130 loc) · 4.15 KB
/
lineage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
from PyOpenWorm import Data,DefaultConfig
from xlrd import open_workbook
from rdflib import Literal, URIRef, Graph,Namespace,RDFS
import re
import httplib as H
from itertools import chain
from os import getcwd
# Read in from the spreadsheet
# leading substring
# matched names in first column go in an array
# Assert that the development name is actually unique
def read(n, sheet_number, cols, start=1):
    """Yield the rows of one worksheet as lists of strings.

    n            -- workbook path, passed to open_workbook()
    sheet_number -- sheet index, passed to sheet_by_index()
    cols         -- number of leading columns taken from every row
    start        -- index of the first row that is yielded (default 1)

    Each yielded list holds ``cols`` items; every cell value is converted
    with str().
    """
    # Resolve the sheet once; the original looked it up again for the row
    # count and for every single cell.
    sheet = open_workbook(n).sheet_by_index(sheet_number)
    for row in range(start, sheet.nrows):
        yield [str(sheet.cell(row, col).value) for col in range(cols)]
# Replace spaces with dots
# Capitals/digits, a dot, then lowercase letters.
postembryonic_regex = re.compile(r"^([A-Z0-9]+)\.([a-z]+)$")
# Capitals/digits, a single space, then lowercase letters.
embryonic_regex = re.compile(r"^([A-Z0-9]+) ([a-z]+)$")
# Capitals/digits, optionally followed by a dot or space and lowercase letters.
# The original pattern read "(:?" (a typo for a group opener) and therefore
# also accepted a stray literal colon, e.g. "AB:.pra".  The group is kept
# capturing so that group numbers 1-3 stay exactly as before.
goodname_regex = re.compile(r"^([A-Z0-9]+)([. ]([a-z]+))?$")
# Capitals/digits immediately followed by lowercase letters (no separator).
nospace_regex = re.compile(r"^([A-Z0-9]+)([a-z]+)$")
# Name normalization helpers (they rely on the expressions compiled above)
def normalize_lineage_name(name):
    """Return *name* cleaned up for comparison with the name expressions.

    The value is converted with str(), everything from the first comma on
    is dropped, and surrounding whitespace is stripped.  If the result
    matches nospace_regex (capitals/digits directly followed by lowercase
    letters), a single space is inserted between the two parts.  Any other
    name is returned as stripped.
    """
    # partition() returns the whole string when there is no comma, so no
    # separate "is there a comma" / "did split return anything" checks are
    # needed (the original's len(parts) > 0 test could never be false).
    n = str(name).partition(",")[0].strip()
    # Find the leading run of capitals and ensure there is a space after it.
    m = nospace_regex.match(n)
    if m:
        n = m.group(1) + " " + m.group(2)
    return n
def normalize(s):
    """Lazily yield normalize_lineage_name(item) for each item of *s*, in order."""
    for raw_name in s:
        yield normalize_lineage_name(raw_name)
def urlize(s, ns):
    """Return ``ns[key]`` where key is *s* with every space turned into "_".

    s  -- the name to look up (a string)
    ns -- any object that supports indexing by string
    """
    key = "_".join(s.split(" "))
    return ns[key]
def bad_names(names):
    """Yield, in order, every name in *names* that goodname_regex does not match."""
    for candidate in names:
        if goodname_regex.match(candidate) is None:
            yield candidate
def good_names(names):
    """Yield, in order, every name in *names* that goodname_regex matches."""
    for candidate in names:
        if goodname_regex.match(candidate) is None:
            continue
        yield candidate
def filter_lineage_slash(i):
    """Yield the items of *i* whose second element (index 1) contains a "/"."""
    for entry in i:
        second = entry[1]
        if '/' not in second:
            continue
        yield entry
def triple_adult_dev_mapping():
    """Yield (column 0, "development_name", column 1) for rows of lineage.xls.

    Rows come from read() on sheet index 2 of "lineage.xls" (looked up
    relative to the current working directory), three columns wide,
    beginning at row index 2.
    """
    for row in read("lineage.xls", sheet_number=2, cols=3, start=2):
        first, second = row[0], row[1]
        yield (first, "development_name", second)
def triple_dev_tree():
    """Yield (column 0, "daughter_of", column 4) for rows of lineage.xls.

    Rows come from read() on sheet index 1 of "lineage.xls" (looked up
    relative to the current working directory), six columns wide,
    beginning at row index 2.
    """
    for row in read("lineage.xls", sheet_number=1, cols=6, start=2):
        first, fifth = row[0], row[4]
        yield (first, "daughter_of", fifth)
#def missing_mappings():
#a = set([r[0] for r in triple_dev_tree()])
#a |= set([r[2] for r in triple_dev_tree()])
#b = set([r[2] for r in triple_adult_dev_mapping()])
#return (a - b, b - a)
def subject(s):
    """Lazily yield element 0 of every item in *s*."""
    firsts = (triple[0] for triple in s)
    for first in firsts:
        yield first
def object(s):
    """Lazily yield element 2 of every item in *s*.

    Note that this module-level name hides the ``object`` builtin inside
    this module.
    """
    thirds = (triple[2] for triple in s)
    for third in thirds:
        yield third
def smap_o(s, f):
    """Yield the triples of *s* with their third element replaced via *f*.

    s -- iterable of 3-item sequences (may be a one-shot generator)
    f -- callable that takes an iterable of the third elements and returns
         an iterable of replacements in the same order (normalize() has
         this shape)

    Yields ``(item[0], item[1], replacement)`` tuples.  The original body
    could not work: zip() produced 2-tuples, so ``i[2] = m`` always raised,
    and it assigned the whole mapped iterable instead of one element.
    """
    from itertools import tee

    # *s* is needed twice (once for the triples, once to feed f); tee()
    # lets a one-shot generator be walked by both consumers while staying lazy.
    for_triples, for_objects = tee(s)
    replacements = f(triple[2] for triple in for_objects)
    for triple, replacement in zip(for_triples, replacements):
        yield (triple[0], triple[1], replacement)
def all_bad_names():
    """Return the set of normalized names that fail goodname_regex.

    Names are gathered from the subjects and objects of triple_dev_tree()
    and the objects of triple_adult_dev_mapping(), normalized with
    normalize(), and filtered with bad_names().
    """
    names = chain(
        subject(triple_dev_tree()),
        object(triple_dev_tree()),
        object(triple_adult_dev_mapping()),
    )
    return set(bad_names(normalize(names)))
def dev_bad_names():
    """Return the set of normalized triple_dev_tree() names that fail goodname_regex.

    Both the objects and the subjects of triple_dev_tree() are checked,
    after normalize(), using bad_names().
    """
    names = chain(
        object(triple_dev_tree()),
        subject(triple_dev_tree()),
    )
    return set(bad_names(normalize(names)))
def put_in_sesame(graph, host="107.170.133.175:8080",
                  path="/openrdf-sesame/repositories/OpenWorm2/statements"):
    """POST the "n3" serialization of *graph* over HTTP and print the status.

    graph -- object providing ``serialize(format=...)``
    host  -- "host:port" handed to HTTPConnection; defaults to the address
             that used to be hard-coded here
    path  -- request path; defaults to the path that used to be hard-coded

    Nothing is returned and the status code is only printed, not checked.
    """
    payload = graph.serialize(format="n3")
    # NOTE(review): the body is serialized as "n3" but announced as
    # x-turtle -- confirm that the server accepts this combination.
    headers = {"Content-Type": "application/x-turtle;charset=UTF-8"}
    con = H.HTTPConnection(host)
    try:
        con.request("POST", path, payload, headers)
        r = con.getresponse()
        # Parenthesized single-argument form prints the same text on
        # Python 2 and is also valid Python 3.
        print("sesame response is %d " % r.status)
    finally:
        # The original never closed the connection.
        con.close()
class D:
    """Holder for the RDF namespace used by the graph builders in this file."""
    # Base namespace; it is indexed with names/predicates to obtain URIs.
    namespace = Namespace("http://openworm.org/entities/")


# Module-level instance; f() and tree_graph() read d.namespace.
d = D()
def f(i):
    """Return the d.namespace entry for *i*.

    *i* is first passed through normalize_lineage_name() and the result is
    looked up with urlize().
    """
    normalized = normalize_lineage_name(i)
    return urlize(normalized, d.namespace)
def tree_graph():
    """Return a new Graph holding one triple per item of triple_dev_tree().

    Subject and object are converted with f(); the predicate string is
    looked up in d.namespace.
    """
    graph = Graph()
    for subj, pred, obj in triple_dev_tree():
        graph.add((f(subj), d.namespace[pred], f(obj)))
    return graph
def adult_dev_graph():
    """Return a new Graph of RDFS label triples for well-formed names.

    For every item of triple_adult_dev_mapping() whose third element
    matches goodname_regex, the triple
    ``(f(name), RDFS["label"], Literal(str(name)))`` is added.
    """
    graph = Graph()
    for mapping in triple_adult_dev_mapping():
        dev_name = mapping[2]
        if goodname_regex.match(dev_name):
            # The original yielded this exact triple twice in a row.  An
            # rdflib Graph keeps a given triple only once, so a single
            # add() produces the same graph.
            # NOTE(review): mapping[0] and the "development_name" predicate
            # are never written to the graph -- the duplicated line may have
            # been meant to emit them; confirm the intended second triple.
            graph.add((f(dev_name), RDFS["label"], Literal(str(dev_name))))
    return graph
def upload_tree():
    """Build the graph with tree_graph() and send it with put_in_sesame()."""
    graph = tree_graph()
    put_in_sesame(graph)
def upload_adult_dev_mapping():
    """Build the graph with adult_dev_graph() and send it with put_in_sesame()."""
    graph = adult_dev_graph()
    put_in_sesame(graph)
def print_serialization(g):
    """Print the "nt" serialization of *g*.

    A Data(DefaultConfig) object is constructed first; its value is not
    used afterwards.
    """
    # NOTE(review): the result is unused -- kept in case constructing Data
    # has side effects; confirm whether this call is still needed.
    unused_data = Data(DefaultConfig)
    serialized = g.serialize(format="nt")
    print(serialized)
if __name__ == "__main__":
    # Script entry point: print the subject of every triple_dev_tree() triple,
    # one per line.
    #print_serialization(adult_dev_graph())
    dev_tree_subjects = subject(triple_dev_tree())
    for x in dev_tree_subjects:
        print(x)
    #for x in dev_bad_names():
    #print x