1
2
3
4
5
6 """Martel based parser to read SAF formatted files.
7
This is a huge regular expression for SAF, built using
the 'regular expressions on steroids' capabilities of Martel.
10
11 http://www.embl-heidelberg.de/predictprotein/Dexa/optin_safDes.html
12
13
14 Notes:
15 Just so I remember -- the new end of line syntax is:
16 New regexp syntax - \R
17 \R means "\n|\r\n?"
18 [\R] means "[\n\r]"
19
20 This helps us have endlines be consistent across platforms.
21
22 """
23
24
25
26
27 import Martel
28 from Martel import Str
29 from Martel import AnyEol
30 from Martel import ToEol
31 from Martel import Group
32 from Martel import Alt
33 from Martel import Rep
34 from Martel import Rep1
35 from Martel import Any
36 from Martel import Opt
37 from Martel import ToSep
38 from Martel.Expression import Assert
39
40
41
42
43
44
45
46
47
48
# Character classes from which the SAF grammar below is assembled.

# Blank characters: tab and space.
white_space = "\t "
digits = "0123456789"

# Characters allowed on a residue-number line: digits, blanks and '.'.
valid_residue_characters = digits + white_space + "."

# Characters allowed in the sequence part of an alignment line:
# ASCII letters (both cases), '-', '.', space and tab.
valid_sequence_characters = (
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '-. \t'
)
# A line built only from residue-numbering characters (digits, blanks
# and '.'), followed by an end of line.
residue_number_line = Group(
    "residue_number_line",
    Rep1(Any(valid_residue_characters)) + AnyEol()
)
# A comment line: a literal '#' and then everything up to the end of
# the line.
comment_line = Group("comment_line", Str("#") + ToEol())
# Lines grouped as "ignored": either a comment line or a
# residue-number line.
ignored_line = Group(
    "ignored_line",
    Alt(comment_line, residue_number_line)
)
# A candidate alignment line.
# NOTE(review): the two Assert(..., 1) terms are presumably negative
# lookaheads (second argument taken to be Martel's "invert" flag), i.e.
# the line must start with neither '#' nor a residue-number character --
# confirm against Martel.Expression.Assert.
# The remainder is: text up to a ' ' separator, then a run of sequence
# characters, then whatever is left up to the end of the line.
candidate_line = Group(
    "candidate_line",
    Assert(Str("#"), 1)
    + Assert(Any(valid_residue_characters), 1)
    + ToSep(sep=' ')
    + Rep(Any(valid_sequence_characters))
    + ToEol()
)
# A complete SAF record: one candidate line, then any mix of further
# candidate lines and ignored lines, then an optional literal '#'.
saf_record = Group(
    "saf_record",
    candidate_line
    + Rep(Alt(candidate_line, ignored_line))
    + Opt(Str("#"))
)
69