1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 """Functions for parsing, processing, and formatting Python structured text.
18
19 See "Structured Text Formatting
20 Rules":http://www.python.org/sigs/doc-sig/stext.html for more
21 information.
22 """
23
24
25
26
27
28 import cStringIO
29 import htmlentitydefs
30 import re
31 import string
32 import sys
33
34
35
36
37
# Help text, as an HTML fragment, describing structured text.
html_help_text = '''
<h4>Structured Text</h4>

<p>Structured text is a simple set of conventions for formatting
ordinary text. Usually, you can simply type ordinary text where QM
expects structured text; the resulting output will be line-wrapped, with
paragraph breaks indicated by blank lines.</p>

<p>Structured text also provides simple ways of adding elements such as
bulleted and numbered lists, bold and italics text, monospaced text, and
hyperlinks to ordinary text. For example, to obtain bold text, place a
pair of asterisks on either side of it. Thus,
<blockquote><pre>
**hello, there**
</pre></blockquote>
is displayed as
<blockquote>
<b>hello, there</b>
</blockquote>Structured text may be displayed as HTML, as plain text,
or in other formats. The exact format of the output will depend on the
capabilities of the output system.</p>

<p>For a complete description of structured text rules, see the
<a href="http://www.python.org/sigs/doc-sig/stext.html">Structured Text
Formatting Rules</a>.</p>
'''
64
65
66
67
68
class Formatter:
    """Interface for output formatters for the 'StructuredTextProcessor'.

    Valid list environment types are

      * definition list

      * ordered list

      * paragraph

      * unordered list

    Valid styles are

      * emphasized

      * strong

      * underlined

      * literal

      * verbatim

    """

    pass
97
98
99
"""Formatter for generating plain text from structured text."""

# The plain-text marker string associated with each style name.
__style_markers = {
    "emphasized" : "*",
    "strong" : "**",
    "underlined" : "_",
    "literal" : "'",
    "verbatim" : "'''",
    }
110
111
def __init__(self,
             output_file=sys.stdout,
             width=78,
             indent_size=2,
             indent=0,
             list_bullet="-"):
    """Create a new plain text formatter.

    'output_file' -- A file object to which the formatted text is
    written.

    'width' -- The column past which a word is moved to the next
    line.

    'indent_size' -- The number of columns by which the indentation
    changes for a paragraph nested inside a list.

    'indent' -- The initial indentation, in columns.

    'list_bullet' -- The string written in front of each item of
    an unordered list."""

    self.__output_file = output_file
    self.__width = width
    # The column at which the next character will be written.
    self.__col = 0
    self.__indent = indent
    self.__indent_size = indent_size
    self.__list_bullet = list_bullet
    # The number of list environments currently open.
    self.__list_depth = 0
    # The target of the hyperlink being written, if any.
    self.__current_link_target = None
    # Hyperlink targets seen so far; a target's position (plus one)
    # is its reference number.
    self.__link_targets = []
132
133
def End(self):
    """End the processed text document.

    If any hyperlink targets were recorded, they are written out
    one per line, each preceded by its reference number in square
    brackets."""

    if self.__link_targets:
        self.__NextLine()
        # Reference numbers are one-based.
        for index, target in enumerate(self.__link_targets):
            self.WriteText("[%d] %s" % (index + 1, target))
            self.__NextLine()
147
148
def WriteText(self, text):
    """Write ordinary text.

    'text' -- The text to write.  A word that would extend past the
    configured width is moved to a new line, which is indented to
    the current indentation; spaces that would begin such a line
    are dropped."""

    # Split at single spaces, keeping each space as a token of its
    # own so that spacing is reproduced, and drop empty tokens.
    words = [word for word in re.split("( )", text) if word]

    start_of_line = False
    for word in words:
        if self.__col + len(word) > self.__width:
            # The word does not fit; wrap to a fresh, indented line.
            self.__NextLine()
            self.__IndentTo(self.__indent)
            start_of_line = True
        if start_of_line:
            # Do not begin a wrapped line with whitespace.
            if word.strip() == "":
                continue
            start_of_line = False
        self.__Write(word)
177
178
def StartList(self, type):
    """Start a list environment of type 'type'.

    'type' -- The environment type.  A paragraph started while
    another environment is already open increases the indentation
    by the indent size."""

    if type == "paragraph" and self.__list_depth > 0:
        self.__indent += self.__indent_size
    self.__list_depth += 1
188
189
def EndList(self, type):
    """End a list environment of type 'type'.

    'type' -- The environment type.  Ending a paragraph while
    another environment remains open decreases the indentation by
    the indent size."""

    self.__list_depth -= 1
    # The depth is tested after the decrement, mirroring the test
    # made before the increment in 'StartList'.
    if type == "paragraph" and self.__list_depth > 0:
        self.__indent -= self.__indent_size
199
200
def StartItem(self, type, label=None):
    """Begin an element to the environment of type 'type'.

    'type' -- The environment type.

    'label' -- If type is "ordered list", this is the label for
    this list element."""

    self.__IndentTo(self.__indent)
    if type == "ordered list":
        self.__Write("%s " % label)
    elif type == "unordered list":
        self.__Write("%s " % self.__list_bullet)
    # Nothing is written here for a definition list or any other
    # environment type.
215
216
def FinishDefinedTerm(self):
    """Finish the definition of a term in a definition list.

    Writes the " -- " separator that follows the term."""

    self.__Write(" -- ")
221
222
def EndItem(self, type):
    """End an element in the environment of type 'type'.

    'type' -- The environment type.  Only paragraphs produce
    output here."""

    if type == "paragraph":
        # Finish the current line if anything has been written
        # beyond the indentation.
        if self.__col > self.__indent:
            self.__NextLine()
        # Then skip one more line.
        self.__NextLine()
233
234
239
240
245
246
def StartLink(self, target):
    """Begin a hyperlink to 'target'.

    'target' -- The link target.  It is remembered until the link
    is ended."""

    # Links may not be nested.
    assert self.__current_link_target is None
    self.__current_link_target = target
255
256
def EndLink(self):
    """End a hyperlink.

    Writes the link's reference number in square brackets.  A
    target seen for the first time is appended to the list of
    targets; a target seen before reuses its earlier number."""

    target = self.__current_link_target
    assert target is not None
    self.__current_link_target = None

    if target not in self.__link_targets:
        self.__link_targets.append(target)
    # Reference numbers are one-based positions in the target list.
    reference_number = self.__link_targets.index(target) + 1

    self.__Write(" [%d]" % reference_number)
277
278
279
280
def __IndentTo(self, col):
    """Write spaces until the output column reaches 'col'.

    'col' -- The target column.  Nothing is written if the current
    column is already at or past it."""

    if col > self.__col:
        self.__Write(" " * (col - self.__col))
284
285
def __Write(self, text):
    """Write 'text' to the output file and advance the column.

    'text' -- The string to write.  The column counter grows by
    its length."""

    self.__output_file.write(text)
    self.__col += len(text)
289
290
def __NextLine(self):
    """End the current output line and reset the column to zero."""

    self.__Write("\n")
    self.__col = 0
294
295
296
432
433
434
"""Parser and formatter for Python structured text."""

# Regex fragment: the characters that may follow styled text.
__punctuation = "[%s]" % "][)(.,!?;:'\" "

# A bullet at the start of a line: '-', 'o', or '*', then spaces.
__bullet_regex = re.compile("^[-o*] +")

# A sequence label at the start of a line: one or more components,
# each either letters followed by a period or digits optionally
# followed by a period, then spaces.  Group 1 is the last component.
# The leading components are written so that a run of digits can be
# split in only one way; the single-group form
# '([A-Za-z]+\.|[0-9]+\.?)+' accepts the same labels but backtracks
# exponentially on a long run of digits that is not a label.
__sequence_regex = re.compile(
    r"^(?:[A-Za-z]+\.|[0-9]+\.|[0-9]+(?=[A-Za-z]))*"
    r"([A-Za-z]+\.|[0-9]+\.?) +")

# A defined term at the start of a line: the term (group 1), then
# ' -- ' with any number of surrounding spaces.
__definition_regex = re.compile("^(.*) +-- +")

# A line break together with the spaces on either side of it.
__collapse_regex = re.compile(" *\n *", re.MULTILINE)

# The run of spaces at the start of the text.
__indent_regex = re.compile("^ *")

# Styled text.  In each pattern group 1 is the leading spaces (or
# the start of the text), group 2 the styled text, and group 3 the
# punctuation that follows (or the end of the text).

# Literal text is enclosed in single quotes.
__literal_regex = re.compile("( +|^)'([^']+)'(%s+|$)" % __punctuation)

# Strong text is enclosed in double asterisks.
__strong_regex = re.compile(r"( +|^)\*\*([^*]+)\*\*(%s+|$)" % __punctuation)

# Emphasized text is enclosed in single asterisks.
__emph_regex = re.compile(r"( +|^)\*([^*]+)\*(%s+|$)" % __punctuation)

# Underlined text is enclosed in underscores.
__underline_regex = re.compile("( +|^)_([^_]+)_(%s+|$)" % __punctuation)

# Candidate link text: anything enclosed in double quotes.
__link_regex = re.compile('"([^"]*)"')

# A link footnote: a line starting with '..', then the quoted link
# text (group 1) and the target (group 2); the rest of the line is
# matched and ignored.
__link_footnote_regex = re.compile(r'\n\.\. *"([^"]*)" *([^ \n]*)[^\n]*')

# Environment types whose current item is closed before another
# environment is opened inside them.
__non_nestable_types = [
    "paragraph",
    ]
491
492
def __init__(self, formatter):
    """Create a new structured text processor.

    'formatter' -- The formatter to use to generate output."""

    # The open environments, innermost last, as '(type, indentation)'
    # pairs.
    self.__stack = []
    self.__formatter = formatter
    # Map from link text to link target, filled in from footnotes.
    self.__hyperlinks = {}
501
502
def NormalizeSpaces(self, text):
    """Return 'text' with spaces normalized.

    'text' -- The text to normalize.

    returns -- 'text' with each tab replaced by a space, leading
    and trailing whitespace removed, and a single space appended."""

    text = text.replace("\t", " ")
    return text.strip() + " "
510
511
def __call__(self, text):
    """Process structured text 'text'.

    'text' -- A string of structured text.  Link footnotes (a line
    starting with '..', followed by the quoted link text and the
    target) are recorded and removed.  The remaining text is split
    into paragraphs; each is classified as verbatim text, a list
    item, or an ordinary paragraph and passed to the formatter."""

    # Record and remove the link footnotes.  Searching from an
    # offset, rather than in a slice, makes the match positions
    # index 'text' directly.
    position = 0
    while position < len(text):
        match = self.__link_footnote_regex.search(text, position)
        if match is None:
            break
        link_text = match.group(1).strip()
        self.__hyperlinks[link_text] = match.group(2)
        text = text[:match.start()] + text[match.end():]
        # Resume at the point where the footnote was removed.
        position = match.start()

    for paragraph in get_paragraphs(text):
        # Verbatim paragraphs are written without further processing.
        match = _verbatim_regexp.match(paragraph)
        if match:
            # Use the indentation of the innermost open environment.
            if self.__stack:
                indentation = self.__stack[-1][1]
            else:
                indentation = 0
            self.__SetType("verbatim", indentation)
            self.__formatter.StartStyle("verbatim")
            # Drop the three quotes at either end.
            self.__formatter.WriteText(match.group(1)[3:-3])
            self.__formatter.EndStyle("verbatim")
            continue

        # Measure the indentation and strip it from the paragraph.
        indents = self.__indent_regex.findall(paragraph)
        indentation = min(map(len, indents))
        paragraph = paragraph[indentation:]
        if paragraph == "":
            continue

        first_line = paragraph.split("\n", 1)[0]

        # Try each kind of list marker in turn; the first that
        # matches the first line determines the environment.
        for regex, list_type, has_label in (
                (self.__bullet_regex, "unordered list", False),
                (self.__sequence_regex, "ordered list", True),
                (self.__definition_regex, "definition list", True),
                ):
            match = regex.match(first_line)
            if match is None:
                continue
            if has_label:
                self.__SetType(list_type, indentation,
                               label=match.group(1))
            else:
                self.__SetType(list_type, indentation)
            # The item's text starts after the marker, and is
            # indented by the marker's width as well.
            match_length = len(match.group(0))
            indentation = indentation + match_length
            paragraph = paragraph[match_length:]
            break

        # Join the paragraph's lines and tidy its whitespace.
        paragraph = self.__collapse_regex.sub(" ", paragraph)
        paragraph = self.NormalizeSpaces(paragraph)

        self.__SetType("paragraph", indentation)
        self.__WriteText(paragraph)
613
614
def End(self):
    """Stop processing text, and do any necessary cleanup.

    Every open environment is closed, innermost first, and the
    formatter is then told that the document has ended."""

    while self.__stack:
        top_type = self.__stack[-1][0]
        # Close the current item, then the environment itself.
        self.__formatter.EndItem(top_type)
        self.__PopType()
    self.__formatter.End()
627
628
629
630
def __PushType(self, type, indentation):
    """Start a new environment.

    'type' -- The type of the environment to start.

    'indentation' -- The indentation recorded for the environment."""

    # If the enclosing environment is of a non-nestable type, close
    # its current item before opening the new environment.
    if self.__stack:
        top_type = self.__stack[-1][0]
        if top_type in self.__non_nestable_types:
            self.__formatter.EndItem(top_type)

    self.__formatter.StartList(type)
    self.__stack.append((type, indentation))
645
646
def __PopType(self):
    """End and remove the innermost environment."""

    self.__formatter.EndList(self.__stack[-1][0])
    self.__stack.pop()

    # If the environment now innermost is of a non-nestable type,
    # start a new item in it; its previous item was closed when the
    # popped environment was opened.
    if self.__stack:
        top_type = self.__stack[-1][0]
        if top_type in self.__non_nestable_types:
            self.__formatter.StartItem(top_type)
664
665
def __SetType(self, type, indentation, label=None):
    """Set the environment type and indentation level.

    'type' -- The type of environment the next item belongs to.

    'indentation' -- The indentation of the next item.

    'label' -- The item's label, passed to the formatter when the
    item is started.  For a definition list it is also written as
    the defined term."""

    while True:
        if self.__stack:
            top_type, top_indentation = self.__stack[-1]
        else:
            # An empty stack behaves like an environment to the left
            # of everything, so that any item opens a new one.
            top_type, top_indentation = None, -1

        if indentation > top_indentation:
            # Deeper than the innermost environment: nest a new one.
            self.__PushType(type, indentation)
            break

        # At or to the left of the innermost environment: its
        # current item is finished.
        self.__formatter.EndItem(top_type)
        if indentation < top_indentation:
            # Further left: close the environment and look again.
            self.__PopType()
        elif top_type != type:
            # Same indentation, different type: replace the
            # environment.
            # NOTE(review): the loop then runs once more, so
            # 'EndItem' is also called for the environment just
            # opened -- confirm that this is intended.
            self.__PopType()
            self.__PushType(type, indentation)
        else:
            # Same indentation and type: continue this environment.
            break

    self.__formatter.StartItem(type, label)
    if type == "definition list":
        self.__WriteText(label)
        self.__formatter.FinishDefinedTerm()
706
707
def __WriteText(self, text):
    """Write paragraph text.

    'text' -- The text to write.  The first span of styled text
    found (trying literal, strong, emphasized, then underlined) is
    written in that style, with the text before and after it
    processed recursively.  Otherwise, if the first double-quoted
    span is the text of a known hyperlink, it is written as a
    link.  Text containing neither is passed to the formatter
    unchanged."""

    styles = (
        (self.__literal_regex, "literal"),
        (self.__strong_regex, "strong"),
        (self.__emph_regex, "emphasized"),
        (self.__underline_regex, "underlined"),
        )
    for regex, style in styles:
        match = regex.search(text)
        if match is None:
            continue
        # Write everything up to and including the leading spaces.
        self.__WriteText(text[:match.end(1)])
        self.__formatter.StartStyle(style)
        if style == "literal" or style == "verbatim":
            # Literal text is not processed any further.
            self.__formatter.WriteText(match.group(2))
        else:
            # Other styles may contain further markup.
            self.__WriteText(match.group(2))
        self.__formatter.EndStyle(style)
        # Continue with the trailing punctuation and what follows.
        self.__WriteText(text[match.start(3):])
        return

    match = self.__link_regex.search(text)
    if match is not None:
        link_text = match.group(1).strip()
        # Quoted text is a link only if a footnote defined a target
        # for it.
        if link_text in self.__hyperlinks:
            link_target = self.__hyperlinks[link_text]
            self.__WriteText(text[:match.start(0)])
            self.__formatter.StartLink(link_target)
            self.__WriteText(match.group(1))
            self.__formatter.EndLink()
            # Skip the closing quote.
            self.__WriteText(text[match.end(1) + 1:])
            return

    # Nothing special was found; write the text as it is.
    self.__formatter.WriteText(text)
766
767
768
769
770
771
776
777
784
785
796
797
def to_text(structured_text, width=78, indent=0):
    """Return 'structured_text' formatted as plain text.

    'structured_text' -- The structured text to format.

    'width' -- The width of the text (including the indentation).

    'indent' -- The width of the block indentation of the formatted
    output.

    returns -- The formatted text, as a string."""

    # Format into an in-memory file, and return its contents.
    output_string = cStringIO.StringIO()
    formatter = TextFormatter(output_string, width=width, indent=indent)
    __format(structured_text, formatter)
    return output_string.getvalue()
813
814
def get_first(structured_text):
    """Return the first line of 'structured_text'.

    By convention, the first line of a structured text description is a
    short summary.

    'structured_text' -- A string of structured text.

    returns -- The text up to, but not including, the first newline;
    all of the text if it contains no newline."""

    return structured_text.split("\n", 1)[0]
822
823
def get_rest(structured_text):
    """Return the contents of 'structured_text' minus the first line.

    'structured_text' -- A string of structured text.

    returns -- The text following the first newline, or the empty
    string if the text consists of a single line."""

    parts = structured_text.split("\n", 1)
    # Splitting always yields at least one part; there is a second
    # part only if the text contains a newline.
    if len(parts) > 1:
        return parts[1]
    return ""
833
834
def get_paragraphs(structured_text):
    """Split 'structured_text' into paragraphs.

    'structured_text' -- A string consisting of structured text.

    returns -- A sequence of paragraphs of structured text.  Each
    element in the sequence corresponds to a successive paragraph
    in the 'structured_text'.  If 'structured_text' is the empty
    string, the sequence returned will consist of a single
    paragraph, itself empty."""

    paragraphs = []
    length = len(structured_text)
    # The current paragraph starts at 'begin'; 'end' is how far the
    # text has been scanned.
    begin = 0
    end = 0
    while end < length:
        # At the start of a paragraph, look for verbatim text, which
        # runs to the closing quotes even across blank lines.
        if (length - end >= 6
            and structured_text[end:end + 3] == "'''"):
            closing = structured_text.find("'''", end + 3)
            if closing >= 0:
                end = closing + 3
                paragraphs.append(structured_text[begin:end])
                begin = end
                continue
            # Without closing quotes the text is scanned like an
            # ordinary paragraph, so that the scan always advances.

        # Find the separator that ends this paragraph.
        match = __paragraph_regexp.search(structured_text, end)
        if match:
            paragraphs.append(structured_text[begin:match.start()])
            # The next paragraph starts after the separator.
            begin = end = match.end()
        else:
            end = length

    # Add the text after the last separator, if any.  Empty input
    # yields one empty paragraph.
    if begin != end or not paragraphs:
        paragraphs.append(structured_text[begin:end])

    return paragraphs
888
889
def get_first_paragraph(structured_text):
    """Return the first paragraph of 'structured_text'.

    'structured_text' -- A string consisting of structured text.

    returns -- A string of structured text that is the first paragraph
    of the 'structured_text', or the empty string if there are no
    paragraphs."""

    paragraphs = get_paragraphs(structured_text)
    if paragraphs:
        return paragraphs[0]
    return ""
899
900
901
902
903
904
905
# Regular expression matching any single character for which an HTML
# entity is defined.  Only entity definitions that are one character
# long are used.
__entity_char_regex = [
    character
    for character in htmlentitydefs.entitydefs.values()
    if len(character) == 1
    ]
__entity_char_regex = "[" + "".join(__entity_char_regex) + "]"
__entity_char_regex = re.compile(__entity_char_regex)

# Map from each such character to its entity reference.
__entity_char_replacement = {}
for entity, character in htmlentitydefs.entitydefs.items():
    if len(character) == 1:
        __entity_char_replacement[character] = "&%s;" % entity

# Replace the map with a function that takes a match of a single
# character and returns that character's entity reference.  The map
# is bound as a default argument because the name is being rebound.
__entity_char_replacement = lambda match, \
                            replacement_map=__entity_char_replacement: \
                            replacement_map[match.group(0)]

# Regular expression matching a paragraph separator: one or more
# line breaks, each optionally followed by spaces, then a line break.
__paragraph_regexp = re.compile("(?:\n *)+\n")

# Regular expression matching a verbatim paragraph: text enclosed in
# triple single quotes (group 1), followed by a paragraph separator
# or the end of the text.
_verbatim_regexp = re.compile("('''.*''')(?:(?:\n *)+\n|\n?$)", re.DOTALL)
931
932
933
934
935
936
937
# When run as a script, format the structured text in the files named
# on the command line (or standard input, if none are named) and write
# the result to standard output.
if __name__ == "__main__":
    import getopt

    long_options = [
        "html",
        "text",
        ]
    options, arguments = getopt.getopt(sys.argv[1:], "", long_options)

    # Choose the formatter; if both options are given the last one
    # wins, and plain text is the default.
    formatter = None
    for option, option_argument in options:
        if option == "--html":
            formatter = HtmlFormatter()
        elif option == "--text":
            formatter = TextFormatter()
    if formatter is None:
        formatter = TextFormatter()

    processor = StructuredTextProcessor(formatter)

    if len(arguments) == 0:
        # No files were named; read from standard input.
        processor(sys.stdin.read())
    else:
        for file_name in arguments:
            input_file = open(file_name, "rt")
            # Close each file as soon as it has been read.
            try:
                processor(input_file.read())
            finally:
                input_file.close()

    # Close any open environments and finish the document.
    processor.End()

    sys.exit(0)
978
979
980
981
982
983
984
985
986