Package qm :: Module structured_text
[hide private]
[frames | no frames]

Source Code for Module qm.structured_text

  1  #!/usr/bin/python 
  2  ######################################################################## 
  3  # 
  4  # File:   structured_text.py 
  5  # Author: Alex Samuel 
  6  # Date:   2001-03-04 
  7  # 
  8  # Contents: 
  9  #   Code for processing structured text. 
 10  # 
 11  # Copyright (c) 2001, 2002 by CodeSourcery, LLC.  All rights reserved.  
 12  # 
 13  # For license terms see the file COPYING. 
 14  # 
 15  ######################################################################## 
 16   
 17  """Functions for parsing, processing, and formatting Python structured text. 
 18   
 19  See "Structured Text Formatting 
 20  Rules":http://www.python.org/sigs/doc-sig/stext.html for more 
 21  information. 
 22  """ 
 23   
 24  ######################################################################## 
 25  # imports 
 26  ######################################################################## 
 27   
 28  import cStringIO 
 29  import htmlentitydefs 
 30  import re 
 31  import string 
 32  import sys 
 33   
 34  ######################################################################## 
 35  # constants 
 36  ######################################################################## 
 37   
# HTML fragment that introduces structured text to the reader: what it
# is, an example of the bold markup, and a link to the full formatting
# rules.
html_help_text = '''
<h4>Structured Text</h4>

<p>Structured text is a simple set of conventions for formatting
ordinary text.  Usually, you can simply type ordinary text where QM
expects structured text; the resulting output will be line-wrapped, with
paragraph breaks indicated by blank lines.</p>

<p>Structured text also provides simple ways of adding elements such as
bulleted and numbered lists, bold and italics text, monospaced text, and
hyperlinks to ordinary text.  For example, to obtain bold text, place a
pair of asterisks on either side of it.  Thus,
<blockquote><pre>
**hello, there**
</pre></blockquote>
is displayed as
<blockquote>
<b>hello, there</b>
</blockquote>Structured text may be displayed as HTML, as plain text,
or in other formats.  The exact format of the output will depend on the
capabilities of the output system.</p>

<p>For a complete description of structured text rules, see the
<a href="http://www.python.org/sigs/doc-sig/stext.html">Structured Text
Formatting Rules</a>.</p>
'''
 64   
 65  ######################################################################## 
 66  # classes 
 67  ######################################################################## 
 68   
class Formatter:
    """Base interface for 'StructuredTextProcessor' output formatters.

    Formatters are driven with list environment types and text styles
    identified by name.  The list environment types are

      * definition list

      * ordered list

      * paragraph

      * unordered list

    The text styles are

      * emphasized

      * strong

      * underlined

      * literal

      * verbatim

    """
97 98 99
class TextFormatter(Formatter):
    """Formatter for generating plain text from structured text."""

    # Marker string for each text style.  'StartStyle' and 'EndStyle'
    # both write the same marker, so styled text is bracketed by it.
    __style_markers = {
        "emphasized" : "*",
        "strong" : "**",
        "underlined" : "_",
        "literal" : "'",
        "verbatim" : "'''",
        }

112 - def __init__(self, 113 output_file=sys.stdout, 114 width=78, 115 indent_size=2, 116 indent=0, 117 list_bullet="-"):
118 """Create a new HTML formatter. 119 120 'output_file' -- A file object to which HTML source is 121 written.""" 122 123 self.__output_file = output_file 124 self.__width = width 125 self.__col = 0 126 self.__indent = indent 127 self.__indent_size = indent_size 128 self.__list_bullet = list_bullet 129 self.__list_depth = 0 130 self.__current_link_target = None 131 self.__link_targets = []
132 133
134 - def End(self):
135 """End the processed text document.""" 136 137 # If there were any hyperlink references placed, we need to list 138 # the link targets at the end of the document. 139 if self.__link_targets: 140 self.__NextLine() 141 for index in range(0, len(self.__link_targets)): 142 # Print the reference number and link target, one to a 143 # line. 144 target = self.__link_targets[index] 145 self.WriteText("[%d] %s" % (index + 1, target)) 146 self.__NextLine()
147 148
149 - def WriteText(self, text):
150 """Write ordinary text.""" 151 152 # Split the text into words. Use 're.split' and grouping 153 # around the separator so that the resulting list contains 154 # elements for the separators, too. 155 words = re.split("( )", text) 156 # Remove empty strings. 157 words = filter(None, words) 158 # Loop over words. 159 start_of_line = 0 160 for word in words: 161 # Does this word fit on the line? 162 if self.__col + len(word) > self.__width: 163 # No. Go to the next line. 164 self.__NextLine() 165 self.__IndentTo(self.__indent) 166 start_of_line = 1 167 # Are we at the beginning of a line? 168 if start_of_line: 169 if string.strip(word) == "": 170 # Don't print spaces at the start of a line. 171 continue 172 else: 173 # No longer. 174 start_of_line = 0 175 # Write the word. 176 self.__Write(word)
177 178
179 - def StartList(self, type):
180 """Start a list environment of type 'type'.""" 181 182 # Bump up indentation for paragraphs, except for the outermost 183 # level. 184 if type == "paragraph" and self.__list_depth > 0: 185 self.__indent = self.__indent + self.__indent_size 186 # Keep track of the nesting depth of lists. 187 self.__list_depth = self.__list_depth + 1
188 189
190 - def EndList(self, type):
191 """End a list environment of type 'type'.""" 192 193 # Keep track of the nesting depth of lists. 194 self.__list_depth = self.__list_depth - 1 195 # Bump back indentation when ending paragraph lists, except for 196 # the outermost level. 197 if type == "paragraph" and self.__list_depth > 0: 198 self.__indent = self.__indent - self.__indent_size
199 200
201 - def StartItem(self, type, label=None):
202 """Begin an element to the environment of type 'type'. 203 204 'label' -- If type is "ordered list", this is the label for 205 this list element.""" 206 207 self.__IndentTo(self.__indent) 208 # For list items, emit the appopriate item tag. 209 if type == "ordered list": 210 self.__Write("%s " % label) 211 elif type == "unordered list": 212 self.__Write("%s " % self.__list_bullet) 213 elif type == "definition list": 214 pass
215 216
217 - def FinishDefinedTerm(self):
218 """Finish the definition of a term in a definition list.""" 219 220 self.__Write(" -- ");
221 222
223 - def EndItem(self, type):
224 """End an element in the environment of type 'type'.""" 225 226 if type == "paragraph": 227 # End a paragraph. End this line if we've started writing 228 # on it. 229 if self.__col > self.__indent: 230 self.__NextLine() 231 # Skip another line. 232 self.__NextLine()
233 234
235 - def StartStyle(self, style):
236 """Start a new text style 'style'.""" 237 238 self.__Write(self.__style_markers[style])
239 240
241 - def EndStyle(self, style):
242 """End the text style 'style'.""" 243 244 self.__Write(self.__style_markers[style])
245 246 255 256 277 278 279 # Helper methods. 280
281 - def __IndentTo(self, col):
282 if col > self.__col: 283 self.__Write(" " * (col - self.__col))
284 285
286 - def __Write(self, text):
287 self.__output_file.write(text) 288 self.__col = self.__col + len(text)
289 290
291 - def __NextLine(self):
292 self.__Write("\n") 293 self.__col = 0
294 295 296
class HtmlFormatter(Formatter):
    """Formatter for generating HTML from structured text."""

    # The tables below map an environment type or style name to the
    # markup written for it.  An empty string means nothing is written.

    # Written by 'StartList' when an environment is opened.
    __start_list_tags = {
        "definition list": "<dl>\n",
        "ordered list": "<ol>\n",
        "paragraph": "",
        "unordered list": "<ul>\n",
        "verbatim": "",
        }

    # Written by 'EndList' when an environment is closed.
    __end_list_tags = {
        "definition list": "</dl>\n",
        "ordered list": "</ol>\n",
        "paragraph": "",
        "unordered list": "</ul>\n",
        "verbatim": "",
        }

    # Written by 'StartItem' at the start of an item.
    __start_item_tags = {
        "definition list": "<dt>",
        "ordered list": "<li>\n",
        "paragraph": "<p>",
        "unordered list": "<li>\n",
        "verbatim": "",
        }

    # Written by 'EndItem' at the end of an item.  A definition list
    # item ends with '</dd>' because 'FinishDefinedTerm' closes the
    # '<dt>' and opens the '<dd>'.
    __end_item_tags = {
        "definition list": "</dd>\n",
        "ordered list": "</li>\n",
        "paragraph": "</p>\n",
        "unordered list": "</li>\n",
        "verbatim": "",
        }

    # Written by 'StartStyle' before styled text.
    __start_style_tags = {
        "emphasized": "<em>",
        "strong": "<strong>",
        "underlined": "<u>",
        "literal": "<tt>",
        "verbatim": '<pre>\'<span class="verbatim">',
        }

    # Written by 'EndStyle' after styled text.
    __end_style_tags = {
        "emphasized": "</em>",
        "strong": "</strong>",
        "underlined": "</u>",
        "literal": "</tt>",
        "verbatim": '</span>\'</pre>',
        }

349 - def __init__(self, output_file=sys.stdout):
350 """Create a new HTML formatter. 351 352 'output_file' -- A file object to which HTML source is 353 written.""" 354 355 self.__output_file = output_file
356 357
358 - def End(self):
359 """End the processed text document.""" 360 361 pass
362 363
364 - def WriteText(self, text):
365 """Write ordinary text.""" 366 367 text = escape_html_entities(text) 368 self.__Write(text)
369 370
371 - def StartList(self, type):
372 """Start a list environment of type 'type'.""" 373 374 self.__Write(self.__start_list_tags[type])
375 376
377 - def EndList(self, type):
378 """End a list environment of type 'type'.""" 379 380 self.__Write(self.__end_list_tags[type])
381 382
383 - def StartItem(self, type, label=None):
384 """Begin an element to the environment of type 'type'. 385 386 'label' -- If type is "ordered list", this is the label for 387 this list element.""" 388 389 self.__Write(self.__start_item_tags[type])
390 391
392 - def FinishDefinedTerm(self):
393 """Finish the definition of a term in a definition list.""" 394 395 self.__Write("</dt><dd>\n");
396 397
398 - def EndItem(self, type):
399 """End an element in the environment of type 'type'.""" 400 401 self.__Write(self.__end_item_tags[type])
402 403
404 - def StartStyle(self, style):
405 """Start a new text style 'style'.""" 406 407 self.__Write(self.__start_style_tags[style])
408 409
410 - def EndStyle(self, style):
411 """End the text style 'style'.""" 412 413 self.__Write(self.__end_style_tags[style])
414 415 420 421 426 427 428 # Helper methods. 429
430 - def __Write(self, text):
431 self.__output_file.write(text)
432 433 434
class StructuredTextProcessor:
    """Parser and formatter for Python structured text.

    An instance is called with structured text (see '__call__'), which
    it parses and reports to its formatter; 'End' must be called when
    all text has been processed."""

    # Regex fragment matching a single punctuation or space character.
    __punctuation = "[%s]" % "][)(.,!?;:'\" "

    # Regex matching a list bullet at the start of the line.
    __bullet_regex = re.compile("^[-o*] +")

    # Regex matching a sequence label at the start of the line.  Group
    # 1 is the last component of the label.
    __sequence_regex = re.compile(r"^([A-Za-z]+\.|[0-9]+\.?)+ +")

    # Regex matching a definition label at the start of the line.
    # Group 1 is the defined term.
    __definition_regex = re.compile("^(.*) +-- +")

    # Regex matching newlines plus any spaces on either side.
    __collapse_regex = re.compile(" *\n *", re.MULTILINE)

    # Regex matching indentation at the beginning of a line.  It is
    # compiled without 're.MULTILINE', so '^' matches only at the very
    # start of the string that is searched.
    __indent_regex = re.compile("^ *")

    # Regex matching single-quoted literal text.  Group 1 is leading
    # spaces; group 2 is the literal text; group 3 is trailing spaces
    # and/or punctuation.
    __literal_regex = re.compile("( +|^)'([^']+)'(%s+|$)" % __punctuation)

    # Regex matching strong text ('**text**').  Group 1 is leading
    # spaces; group 2 is the strong text; group 3 is trailing spaces
    # and/or punctuation.
    __strong_regex = re.compile(
        r"( +|^)\*\*([^*]+)\*\*(%s+|$)" % __punctuation)

    # Regex matching emphasized text ('*text*').  Group 1 is leading
    # spaces; group 2 is the emphasized text; group 3 is trailing
    # spaces and/or punctuation.
    __emph_regex = re.compile(
        r"( +|^)\*([^*]+)\*(%s+|$)" % __punctuation)

    # Regex matching underlined text.  Group 1 is leading spaces;
    # group 2 is the underlined text; group 3 is trailing spaces and/or
    # punctuation.
    __underline_regex = re.compile("( +|^)_([^_]+)_(%s+|$)" % __punctuation)

    # Regex matching double-quoted text that may be a hyperlink.  If
    # there is a matching link footnote, the contents of the double
    # quotes, group 1, is a hyperlink.
    __link_regex = re.compile('"([^"]*)"')

    # Regex matching hyperlink footnotes.  Group 1 is the link text;
    # group 2 is the link target URL.
    __link_footnote_regex = re.compile('\n\\.\\. *"([^"]*)" *([^ \n]*)[^\n]*')

    # List types which may not include other environments nested
    # inside their items.
    __non_nestable_types = [
        "paragraph",
        ]

493 - def __init__(self, formatter):
494 """Create a new structured text processor. 495 496 'formatter' -- The formatter to use to generate output.""" 497 498 self.__stack = [] 499 self.__formatter = formatter 500 self.__hyperlinks = {}
501 502
503 - def NormalizeSpaces(self, text):
504 """Return 'text' with spaces normalized.""" 505 506 # Convert tabs to spaces. 507 text = string.replace(text, "\t", " ") 508 # Normalize leading and trailing whitespace. 509 return string.strip(text) + " "
510 511
512 - def __call__(self, text):
513 """Process structured text 'text'.""" 514 515 # Look for hyperlink footnotes, and build a map of hyperlinked 516 # phrases. Keep track of where the last match was. 517 position = 0 518 while position < len(text): 519 # Look for the next hyperlink footnote match. 520 match = self.__link_footnote_regex.search(text[position:]) 521 if match is None: 522 # No more; all done. 523 break 524 else: 525 # Record the hyperlink. 526 link_text = string.strip(match.group(1)) 527 link_target = match.group(2) 528 self.__hyperlinks[link_text] = link_target 529 # Remove the footnote from the text. 530 text = text[:match.start() + position] \ 531 + text[match.end() + position:] 532 # Next, try searching from the text following the match. 533 position = match.start() 534 535 # Split text into paragraphs. 536 paragraphs = get_paragraphs(text) 537 538 # Loop over paragraphs. 539 for paragraph in paragraphs: 540 # If this is a verbatim paragraph, handle it specially. 541 match = _verbatim_regexp.match(paragraph) 542 if match: 543 if self.__stack: 544 indentation = self.__stack[-1][1] 545 else: 546 indentation = 0 547 self.__SetType("verbatim", indentation) 548 self.__formatter.StartStyle("verbatim") 549 self.__formatter.WriteText(match.group(1)[3:-3]) 550 self.__formatter.EndStyle("verbatim") 551 continue 552 # Extract indentations for all the lines in the paragraph. 553 indents = self.__indent_regex.findall(paragraph) 554 # The paragraph's indentation is the minimum indentation 555 # of its lines. 556 indentation = min(map(len, indents)) 557 # Trim indentation from the first line. 558 paragraph = paragraph[indentation:] 559 560 # Skip empty paragraphs. 561 if paragraph == "": 562 continue 563 564 # Grab the first line of the paragraph. 565 first_line = string.split(paragraph, "\n", 1)[0] 566 567 # Does it look like a bullet (unordered) list item? 568 match = self.__bullet_regex.match(first_line) 569 if match is not None: 570 # Yes. 
Put the formatter into an unordered list 571 # environment. 572 self.__SetType("unordered list", indentation) 573 # Cut off the bullet, and use the indentation of the 574 # text itself. 575 match_length = len(match.group(0)) 576 indentation = indentation + match_length 577 paragraph = paragraph[match_length:] 578 else: 579 # Does it look like a sequence label of an ordered list? 580 match = self.__sequence_regex.match(first_line) 581 if match is not None: 582 # Yes. Put the formatter into an ordered list 583 # environment. 584 self.__SetType("ordered list", indentation, 585 label=match.group(1)) 586 # Cut off the label, and use the indentation of 587 # the text itself. 588 match_length = len(match.group(0)) 589 indentation = indentation + match_length 590 paragraph = paragraph[match_length:] 591 else: 592 match = self.__definition_regex.match(first_line) 593 # Does it look like a definition list item? 594 if match is not None: 595 # Yes. Put the formatter into a definition 596 # list environment. 597 self.__SetType("definition list", indentation, 598 label=match.group(1)) 599 # Cut off the defined term label, and use the 600 # indentation of the definition. 601 match_length = len(match.group(0)) 602 indentation = indentation + match_length 603 paragraph = paragraph[match_length:] 604 605 # Collapse the remaining paragraph into a single line of 606 # text by replacing newlines with spaces. 607 paragraph = self.__collapse_regex.sub(" ", paragraph) 608 # Clean up spacing. 609 paragraph = self.NormalizeSpaces(paragraph) 610 # Now generate a paragraph for the rest of the text. 611 self.__SetType("paragraph", indentation) 612 self.__WriteText(paragraph)
613 614
615 - def End(self):
616 """Stop processing text, and do any necessary cleanup.""" 617 618 # Pop out of any remaining environments. 619 while self.__stack: 620 top_type, top_indentation = self.__stack[-1] 621 # End the item. 622 self.__formatter.EndItem(top_type) 623 # End the environment. 624 self.__PopType() 625 # Finish up the formatter. 626 self.__formatter.End()
627 628 629 # Helper methods. 630
631 - def __PushType(self, type, indentation):
632 """Start a new environment.""" 633 634 # The innermost environment may be of a type that cannot 635 # contain nested environments in its items. If that's the 636 # case, end the item here. 637 if len(self.__stack) > 0: 638 top_type, top_indentation = self.__stack[-1] 639 if top_type in self.__non_nestable_types: 640 self.__formatter.EndItem(top_type) 641 # Start te environment. 642 self.__formatter.StartList(type) 643 # Push it onto the stack. 644 self.__stack.append((type, indentation))
645 646
647 - def __PopType(self):
648 """End and remove the innermost environment.""" 649 650 # Get the topmost environment on the stack. 651 top_type, top_indentation = self.__stack[-1] 652 # End the environment. 653 self.__formatter.EndList(top_type) 654 # Remove it from the stack. 655 self.__stack.pop() 656 # The new innermost environment may be of a type that cannot 657 # contain nested environments. If it is, then we 658 # (prematurely) ended an item when we opened the environment 659 # that just closed. We'll have to open a new item here. 660 if len(self.__stack) > 0: 661 top_type, top_indentation = self.__stack[-1] 662 if top_type in self.__non_nestable_types: 663 self.__formatter.StartItem(top_type)
664 665
    def __SetType(self, type, indentation, label=None):
        """Set the environment type and indentation level.

        Environments are closed and opened as needed until the
        innermost one has type 'type' at 'indentation'; a new item is
        then started in it.

        'type' -- The environment type of the item to start.

        'indentation' -- The indentation level of the item.

        'label' -- The item label, passed on to the formatter.  For a
        "definition list" item it is the defined term, which is written
        before the term is finished."""

        while 1:
            # Look at the current innermost environment (if there is
            # one).  An empty stack is treated as indentation -1, so
            # that any indentation counts as indented.
            if len(self.__stack) == 0:
                top_indentation = -1
            else:
                top_type, top_indentation = self.__stack[-1]

            # Are we outdented from the current environment and
            # indentation level, or at the same indentation?
            if indentation <= top_indentation:
                # End the previous item.
                self.__formatter.EndItem(top_type)
                if indentation < top_indentation:
                    # We're outdented, so end the previous environment.
                    self.__PopType()
                elif top_type != type:
                    # Same indentation but different environment type.
                    # End the previous environment, and start a new
                    # one.
                    # NOTE(review): there is no 'break' here, so the
                    # loop runs once more and calls 'EndItem' for the
                    # environment just pushed, before any item has been
                    # started in it.  'TextFormatter' output appears to
                    # depend on that call to end the preceding line --
                    # confirm before changing.
                    self.__PopType()
                    self.__PushType(type, indentation)
                else:
                    # Same indentation, same environment.  We just
                    # need a new item, so fall through.
                    break
            else:
                # We're indented.  Nest a new environment in the
                # current item.
                self.__PushType(type, indentation)
                break

        # Start a new item in the current environment.
        self.__formatter.StartItem(type, label)
        if type == "definition list":
            self.__WriteText(label)
            self.__formatter.FinishDefinedTerm()
706 707
708 - def __WriteText(self, text):
709 """Write paragraph text.""" 710 711 # Look for various types of markup for special formatting for 712 # a range of text. 713 for regex, style in [ 714 (self.__literal_regex, "literal"), 715 (self.__strong_regex, "strong"), 716 (self.__emph_regex, "emphasized"), 717 (self.__underline_regex, "underlined"), 718 ]: 719 # Find the first match. 720 match = regex.search(text) 721 if match is not None: 722 # Found a match. Recursively format everything up to 723 # the start of the match. 724 self.__WriteText(text[:match.end(1)]) 725 # Start generating text in the indicated style. 726 self.__formatter.StartStyle(style) 727 # If it's a literal style, push the literal text out 728 # directly. Otherwise, format it recursively. 729 if style == "literal" or style == "verbatim": 730 self.__formatter.WriteText(match.group(2)) 731 else: 732 self.__WriteText(match.group(2)) 733 # Stop generating text in the specified style. 734 self.__formatter.EndStyle(style) 735 # Recursively format everything following the match. 736 self.__WriteText(text[match.start(3):]) 737 return 738 739 # Look for hyperlink markup. 740 match = self.__link_regex.search(text) 741 if match is not None: 742 link_text = string.strip(match.group(1)) 743 # Is there a footnote providing a link target for this 744 # phrase? 745 if self.__hyperlinks.has_key(link_text): 746 # Yes. Emit a hyperlink. 747 link_target = self.__hyperlinks[link_text] 748 # Recursively format everything up to the start of the 749 # match. 750 self.__WriteText(text[:match.start(0)]) 751 # Generate the start of the link. 752 self.__formatter.StartLink(link_target) 753 # Recursively format the link text. 754 self.__WriteText(match.group(1)) 755 # End the link. 756 self.__formatter.EndLink() 757 # Recursively format everything following the match. 758 self.__WriteText(text[match.end(1) + 1:]) 759 return 760 else: 761 # Fall through and format the entire text as usual. 762 pass 763 764 # Nothing special. Write ordinary text. 
765 self.__formatter.WriteText(text)
766 767 768 ######################################################################## 769 # functions 770 ######################################################################## 771
def escape_html_entities(text):
    """Return 'text' with each character matched by the entity regex
    replaced by its HTML entity."""

    escaped = __entity_char_regex.sub(__entity_char_replacement, text)
    return escaped
776 777
def __format(text, formatter):
    """Run structured text 'text' through a processor that writes to
    'formatter', and finish the processor."""

    text_processor = StructuredTextProcessor(formatter)
    text_processor(text)
    text_processor.End()
784 785
def to_html(structured_text):
    """Format 'structured_text' as HTML and return the result."""

    # Collect the HTML formatter's output in an in-memory file.
    output = cStringIO.StringIO()
    __format(structured_text, HtmlFormatter(output))
    return output.getvalue()
796 797
def to_text(structured_text, width=78, indent=0):
    """Format 'structured_text' as plain text and return the result.

    'width' -- The width of the text (including the indentation).

    'indent' -- The width of the block indentation of the formatted
    output."""

    # Collect the text formatter's output in an in-memory file.
    output = cStringIO.StringIO()
    text_formatter = TextFormatter(output, width=width, indent=indent)
    __format(structured_text, text_formatter)
    return output.getvalue()
813 814
def get_first(structured_text):
    """Return the first line of 'structured_text'.

    By convention, the first line of a structured text description is a
    short summary.

    'structured_text' -- A string consisting of structured text.

    returns -- The text up to, but not including, the first newline, or
    all of 'structured_text' if it contains no newline."""

    return structured_text.split("\n", 1)[0]
822 823
def get_rest(structured_text):
    """Return the contents of 'structured_text' minus the first line.

    'structured_text' -- A string consisting of structured text.

    returns -- The text following the first newline, or the empty
    string if 'structured_text' consists of a single line."""

    parts = structured_text.split("\n", 1)
    # Splitting always produces at least one part; there is a rest
    # only if a newline was found, which produces a second part.
    if len(parts) > 1:
        return parts[1]
    else:
        return ""
833 834
def get_paragraphs(structured_text):
    """Split 'structured_text' into paragraphs.

    'structured_text' -- A string consisting of structured text.

    returns -- A sequence of paragraphs of structured text.  Each
    element in the sequence corresponds to a successive paragraph
    in the 'structured_text'.  If 'structured_text' is the empty
    string, the sequence returned will consist of a single
    paragraph, itself empty.

    Text enclosed in triple single quotes at the start of a paragraph
    is verbatim text, and forms a paragraph of its own up to and
    including the closing quotes.  An opening triple quote with no
    closing triple quote is treated as ordinary text."""

    # The empty string consists of a single, empty paragraph.
    if structured_text == "":
        return [""]

    length = len(structured_text)
    # There are no paragraphs yet.
    paragraphs = []
    # The first paragraph begins at the first character.
    begin = 0
    # We have not yet found the end of the paragraph.
    end = 0
    # Keep going until there is no more text.
    while end < length:
        # We are at the start of a paragraph; check to see if we
        # might be looking at a piece of verbatim text.
        if (length - end >= 6
            and structured_text[end:end+3] == "'''"):
            closing = structured_text.find("'''", end + 3)
            if closing >= 0:
                end = closing + 3
                # Add the new paragraph to the list.
                paragraphs.append(structured_text[begin:end])
                begin = end
                continue
            # There is no closing marker.  Leave 'end' where it is and
            # scan the text as an ordinary paragraph below; moving
            # 'end' backwards here could make this loop run forever.

        # Scan forward until we find the end of the paragraph or of
        # the text.
        while end < length:
            # See if we are at the end of a paragraph.
            match = __paragraph_regexp.match(structured_text, end)
            if match:
                # Add the new paragraph to the list.
                paragraphs.append(structured_text[begin:end])
                # The next paragraph begins after the separator.
                begin = match.end()
                end = begin
                break
            # Advance to the next character.
            end = end + 1

    # We may have stopped in the middle of a paragraph.
    if begin != end:
        paragraphs.append(structured_text[begin:end])

    return paragraphs
888 889
def get_first_paragraph(structured_text):
    """Return the first paragraph of 'structured_text'.

    'structured_text' -- A string consisting of structured text.

    returns -- A string of structured text that is the first paragraph
    of the 'structured_text', or the empty string if it contains no
    paragraphs."""

    paragraphs = get_paragraphs(structured_text)
    # Guard against text in which no paragraph was found.
    if paragraphs:
        return paragraphs[0]
    return ""
########################################################################
# variables
########################################################################

# Write a regular expression for finding characters that need to be
# escaped as HTML entities.  We only handle single-byte characters.
__entity_char_regex = [character
                       for character in htmlentitydefs.entitydefs.values()
                       if len(character) == 1]
__entity_char_regex = "[" + "".join(__entity_char_regex) + "]"
__entity_char_regex = re.compile(__entity_char_regex)

# Generate a replacement function for special characters to HTML
# entities.  Start by creating a map from the character to the
# corresponding HTML entity code.
__entity_char_replacement = {}
for entity, character in htmlentitydefs.entitydefs.items():
    if len(character) == 1:
        __entity_char_replacement[character] = "&%s;" % entity
# Write a function for use as the regex replacement that looks up the
# corresponding entity for a matched character.  The map is bound as a
# default argument because the name is rebound to the function itself.
__entity_char_replacement = lambda match, \
                            replacement_map=__entity_char_replacement: \
                            replacement_map[match.group(0)]

# Regex matching paragraph separators.
__paragraph_regexp = re.compile("(?:\n *)+\n")

# Regular expression matching verbatim paragraphs and trailing
# whitespace.
_verbatim_regexp = re.compile("('''.*''')(?:(?:\n *)+\n|\n?$)", re.DOTALL)

########################################################################
# script
########################################################################

# If invoked as a script, act as a structured text processor.

if __name__ == "__main__":
    # Parse command-line options.
    import getopt
    long_options = [
        "html",
        "text",
        ]
    options, arguments = getopt.getopt(sys.argv[1:], "", long_options)
    # Interpret them.  If several formats are given, the last one wins.
    formatter = None
    for option, option_argument in options:
        if option == "--html":
            formatter = HtmlFormatter()
        elif option == "--text":
            formatter = TextFormatter()
    # Use a text formatter by default.
    if formatter is None:
        formatter = TextFormatter()

    # Fire up a processor.
    processor = StructuredTextProcessor(formatter)

    # Were input files specified on the command line?
    if len(arguments) == 0:
        # No; read from standard input.
        processor(sys.stdin.read())
    else:
        # Yes; process them in order.  Each file is opened just before
        # it is read and is closed again even if processing fails.
        for file_name in arguments:
            input_file = open(file_name, "rt")
            try:
                processor(input_file.read())
            finally:
                input_file.close()

    # End processing.
    processor.End()

    # All done.
    sys.exit(0)


########################################################################
# Local Variables:
# mode: python
# indent-tabs-mode: nil
# fill-column: 72
# End: