1 """Parser for the SWISS-PROT 38 format.
2
3 You probably want to use the variables 'record' (for a single record)
4 and 'format' (for a set of records).
5
6 """
7 import Martel
8 from Martel import RecordReader
9
15
16
# ID line: entry name, data class, molecule type and sequence length,
# terminated by "AA." and an end of line (\R).
# NOTE(review): the SWISS-PROT manual places line data at column 6 (line
# code followed by three blanks); the literal below has a single blank
# after "ID" -- confirm the spacing has not been collapsed.
ID = Martel.Group(
    "ID",
    Martel.Re(
        r"ID (?P<entry_name>\w+) +(?P<data_class_table>\w+); +"
        r"(?P<molecule_type>\w+); +(?P<sequence_length>\d+) AA\.\R"
    ),
)
21
22
23
# AC line: one or more accession numbers, each terminated by ";".  Every
# accession on the line is captured under the same "ac_number" group name.
AC = Martel.Group(
    "AC",
    Martel.Re(r"AC (?P<ac_number>\w+);( (?P<ac_number>\w+);)*\R"),
)

# One or more consecutive AC lines.
AC_block = Martel.Group(
    "AC_block",
    Martel.Rep1(AC),
)
28
29
30
31
def _dt_line(tag, event):
    """Build the Martel expression for one DT (date) line.

    tag   -- name of the group that wraps the whole line
    event -- literal text following the release number, e.g. "Created";
             it is inserted into the pattern verbatim, so it must not
             contain regex metacharacters

    The line carries a DD-MMM-YYYY date and a two-digit release number in
    the form "(Rel. NN, <event>)".
    """
    return Martel.Group(tag, Martel.Re(
        # The period after "Rel" is escaped so that it only matches a
        # literal "." rather than any character.
        r"DT (?P<day>\d\d)-(?P<month>...)-(?P<year>\d{4}) \(Rel\. "
        r"(?P<release>\d\d), " + event + r"\)\R"
    ))


# The three DT lines differ only in the trailing event text.
DT_created = _dt_line("DT_created", "Created")
DT_seq_update = _dt_line("DT_seq_update", "Last sequence update")
DT_ann_update = _dt_line("DT_ann_update", "Last annotation update")
44
45
46
47
48
# NOTE(review): Simple() is defined outside the visible part of this file;
# presumably Simple(tag, name) builds the expression for one line of type
# `tag` whose text is captured as `name` -- confirm against its definition.
# Each *_block below is one or more consecutive lines of that type.

# DE: description
DE = Simple("DE", "description")
DE_block = Martel.Group(
    "DE_block",
    Martel.Rep1(DE),
)

# GN: gene names
GN = Simple("GN", "gene_names")
GN_block = Martel.Group(
    "GN_block",
    Martel.Rep1(GN),
)

# OS: organism species
OS = Simple("OS", "organism_species")
OS_block = Martel.Group(
    "OS_block",
    Martel.Rep1(OS),
)

# OG: organelle
OG = Simple("OG", "organelle")
OG_block = Martel.Group(
    "OG_block",
    Martel.Rep1(OG),
)

# OC: organism classification
OC = Simple("OC", "organism_classification")
OC_block = Martel.Group(
    "OC_block",
    Martel.Rep1(OC),
)
75
76
77
78
79
80
# RN line: the reference number in square brackets, e.g. "[1]".
# A raw string is used because \[, \d and \R are regex escapes, not valid
# Python string escapes; the pattern text itself is unchanged.
RN = Martel.Group(
    "RN",
    Martel.Re(r"RN \[(?P<reference_number>\d+)]\R"),
)
82
83
84
85
# RP: reference position (a single line; no block is defined for it here).
RP = Simple("RP", "reference_position")

# RC: reference comment; RC_block is one or more consecutive RC lines.
RC = Simple("RC", "reference_comment")
RC_block = Martel.Group(
    "RC_block",
    Martel.Rep1(RC),
)
94
95
96
97
# RX line: bibliographic database name and a numeric identifier,
# terminated by "." and an end of line.
# Raw strings are used because \w, \d, \. and \R are regex escapes, not
# valid Python string escapes; the pattern text itself is unchanged.
RX = Martel.Group(
    "RX",
    Martel.Re(
        r"RX (?P<bibliographic_database_name>\w+); "
        r"(?P<bibliographic_identifier>\d+)\.\R"
    ),
)
101
102
103
104
# Each *_block below is one or more consecutive lines of the given type.

# RA: reference authors
RA = Simple("RA", "reference_author")
RA_block = Martel.Group(
    "RA_block",
    Martel.Rep1(RA),
)

# RT: reference title
RT = Simple("RT", "reference_title")
RT_block = Martel.Group(
    "RT_block",
    Martel.Rep1(RT),
)

# RL: reference location
RL = Simple("RL", "reference_location")
RL_block = Martel.Group(
    "RL_block",
    Martel.Rep1(RL),
)
122
# One literature reference: RN and RP are required, followed by optional
# RC lines and an optional RX line, the required RA lines, optional RT
# lines and the required RL lines -- in exactly this order.
reference = Martel.Group(
    "reference",
    RN
    + RP
    + Martel.Opt(RC_block)
    + Martel.Opt(RX)
    + RA_block
    + Martel.Opt(RT_block)
    + RL_block,
)
132
133
134
135
136
137
# First line of a comment: "CC" followed by the "-!-" marker; the rest of
# the line is captured as comment_text.  Both line kinds share the group
# name "CC".
CC_begin = Martel.Group(
    "CC",
    Martel.Re("CC -!- ") + Martel.ToEol("comment_text"),
)

# Any other CC line; the rest of the line is captured as comment_text.
CC = Martel.Group(
    "CC",
    Martel.Re("CC ") + Martel.ToEol("comment_text"),
)

# A single comment: one "-!-" line followed by zero or more further CC lines.
single_comment = Martel.Group(
    "comment",
    CC_begin + Martel.Rep(CC),
)
149
150
# The copyright notice is delimited by CC lines consisting of dashes.
# Raw strings are used because \R is a regex escape, not a valid Python
# string escape; the pattern text itself is unchanged.

# Opening delimiter: a CC line made of one or more "-".
CC_copyright_begin = Martel.Group(
    "CC_copyright_begin",
    Martel.Re(r"CC -+\R"),
)

# Body line: a CC line that is not a dash delimiter (negative lookahead);
# the rest of the line is captured as "copyright".
CC_copyright = Martel.Group(
    "CC_copyright",
    Martel.Re(r"CC (?!-+\R)") + Martel.ToEol("copyright"),
)

# Closing delimiter: same shape as the opening one.
CC_copyright_end = Martel.Group(
    "CC_copyright_end",
    Martel.Re(r"CC -+\R"),
)
158
159
# Matches one specific DR line (database MIM, identifier 601385, secondary
# identifier "-"); the comment block below accepts it between the comments
# and the copyright notice.
# The terminating period is escaped so that it matches only a literal "."
# instead of any character.
bogus_DR_group = Martel.Group(
    "bogus_DR_block",
    Martel.Re(
        r"(?P<DR>DR (?P<database_identifier>MIM); "
        r"(?P<primary_identifier>601385); "
        r"(?P<secondary_identifier>-)\.\R)"
    ),
)
165
166
# The whole comment section: zero or more comments, then optionally the
# bogus DR line, then optionally a copyright notice (begin delimiter, zero
# or more body lines, end delimiter).
comment = Martel.Group(
    "comment_block",
    Martel.Rep(single_comment)
    + Martel.Opt(bogus_DR_group)
    + Martel.Opt(
        CC_copyright_begin
        + Martel.Rep(CC_copyright)
        + CC_copyright_end
    ),
)
175
176
177
178
179
180
181
# Generic database cross-reference: "<database>; <primary>; <secondary>".
# The secondary identifier is any run of characters that are neither "."
# nor an end of line, or a "." that is not the one directly before the end
# of line (so identifiers may contain periods).
#
# Fixes relative to the previous version:
#  * the separator after the database identifier is now "; " like the other
#    separators and like DR_embl / DR_prosite, so primary_identifier no
#    longer captures a leading blank;
#  * raw strings are used because \R and \. are regex escapes, not valid
#    Python string escapes.
DR_general = Martel.Re(
    r"(?P<database_identifier>[^;]+); "
    r"(?P<primary_identifier>[^;]+); "
    r"(?P<secondary_identifier>([^.\R]|(?!.\R)\.)+)"
)
185
# PROSITE / PFAM cross-reference: database, primary and secondary
# identifiers plus a status field that runs up to (not including) a ".".
DR_prosite = Martel.Re(
    "(?P<database_identifier>(PROSITE|PFAM)); "
    "(?P<primary_identifier>[^;]+); "
    "(?P<secondary_identifier>[^;]+); "
    "(?P<status_identifier>[^.]+)"
)

# EMBL cross-reference: same four fields as above, database fixed to EMBL.
DR_embl = Martel.Re(
    "(?P<database_identifier>EMBL); "
    "(?P<primary_identifier>[^;]+); "
    "(?P<secondary_identifier>[^;]+); "
    "(?P<status_identifier>[^.]+)"
)
195
# A complete DR line: the "DR " prefix, one of the cross-reference forms
# (the specific EMBL and PROSITE/PFAM forms are tried before the general
# one), the closing "." and an end of line.
DR = Martel.Group(
    "DR",
    Martel.Str("DR ")
    + Martel.Group(
        "database_reference",
        Martel.Alt(DR_embl, DR_prosite, DR_general),
    )
    + Martel.Str(".")
    + Martel.AnyEol(),
)

# One or more consecutive DR lines.
DR_block = Martel.Group(
    "DR_block",
    Martel.Rep1(DR),
)
202
203
204
205
206
# KW: keyword line; KW_block is one or more consecutive KW lines.
KW = Simple("KW", "keyword")
KW_block = Martel.Group(
    "KW_block",
    Martel.Rep1(KW),
)
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# Feature table.
# Raw strings are used throughout because \R is a regex escape, not a valid
# Python string escape; the pattern text itself is unchanged.
# NOTE(review): the fields are fixed-width, so the exact number of blanks
# between them matters; the separators below are single blanks -- confirm
# against the SWISS-PROT FT column layout that none were collapsed.

# First line of a feature: an 8-character key, 6-character "from" and "to"
# endpoints and an optional description running to the end of the line.
FT_range = Martel.Group(
    "FT",
    Martel.Re(
        r"FT (?P<ft_name>.{8}) "
        r"(?P<ft_from>.{6}) (?P<ft_to>.{6})"
        r"( (?P<ft_description>[^\R]*))?\R"
    ),
)

# Continuation line: carries only further description text.
FT_continuation = Martel.Group(
    "FT_continuation",
    Martel.Re(
        r"FT "
        r"(?P<ft_description>[^\R]*)\R"
    ),
)

# One feature: its first line plus zero or more continuation lines.
FT = Martel.Group("feature", FT_range + Martel.Rep(FT_continuation))

# One or more features.
feature_block = Martel.Group("feature_block", Martel.Rep1(FT))
236
237
238
239
240
241
242
# Sequence section.
# Raw strings are used because \d, \w and \R are regex escapes, not valid
# Python string escapes; the pattern text itself is unchanged.

# SQ header line: sequence length (AA), molecular weight (MW) and CRC32.
SQ = Martel.Group(
    "SQ",
    Martel.Re(
        r"SQ SEQUENCE +(?P<sequence_length>\d+) AA;"
        r" +(?P<molecular_weight>\d+) MW;"
        r" +(?P<crc32>\w+) CRC32;\R"
    ),
)

# Sequence data line: starts with a blank; the rest of the line is captured
# as "sequence".
# NOTE(review): only one leading blank is required here -- confirm the
# indentation of data lines has not been collapsed.
SQ_data = Martel.Group(
    "SQ_data",
    Martel.Re(r" (?P<sequence>[^\R]*)\R"),
)

# The SQ header followed by zero or more data lines, wrapped in two nested
# groups ("sequence_block" outside, "SQ_data_block" inside).
sequence = Martel.Group(
    "sequence_block",
    Martel.Group("SQ_data_block", SQ + Martel.Rep(SQ_data)),
)
253
254
255
# Record terminator: a line consisting of "//".
end = Martel.Group(
    "END",
    Martel.Str("//") + Martel.AnyEol(),
)
257
258
259
# A complete entry.  The ID line, a single AC line, the three DT lines, the
# sequence section and the "//" terminator are required; every other
# section is optional (the reference and comment sections may be empty).
record = Martel.Group(
    "swissprot38_record",
    ID
    + AC
    + DT_created
    + DT_seq_update
    + DT_ann_update
    + Martel.Opt(DE_block)
    + Martel.Opt(GN_block)
    + Martel.Opt(OS_block)
    + Martel.Opt(OG_block)
    + Martel.Opt(OC_block)
    + Martel.Group("reference_block", Martel.Rep(reference))
    + comment
    + Martel.Opt(DR_block)
    + Martel.Opt(KW_block)
    + Martel.Opt(feature_block)
    + sequence
    + end,
)
279
# The whole file expressed as a single expression: one or more records.
format_expression = Martel.Group(
    "swissprot38",
    Martel.Rep1(record),
)

# Record-oriented form of the same format: built from the record expression
# together with RecordReader.EndsWith and the "//\n" terminator as the
# reader's argument (presumably so the input is split into records before
# each one is parsed -- see Martel.ParseRecords).
format = Martel.ParseRecords(
    "swissprot38",
    {},
    record,
    RecordReader.EndsWith,
    ("//\n",),
)
284