Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 4 Jan 2003 22:14:31 +0800 (CST)
From:      Cheng-Lung Sung <AlanSung@alansung.dragon2.net>
To:        FreeBSD-gnats-submit@FreeBSD.org
Cc:        clsung@dragon2.net
Subject:   ports/46754: [PATCH] pdftohtml add Plain-Text output
Message-ID:  <200301041414.h04EEV4B070967@alansung.dragon2.net>

next in thread | raw e-mail | index | archive | help

>Number:         46754
>Category:       ports
>Synopsis:       [PATCH] pdftohtml add Plain-Text output
>Confidential:   no
>Severity:       non-critical
>Priority:       low
>Responsible:    freebsd-ports
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          change-request
>Submitter-Id:   current-users
>Arrival-Date:   Sat Jan 04 06:20:01 PST 2003
>Closed-Date:
>Last-Modified:
>Originator:     Cheng-Lung Sung <clsung@dragon2.net>
>Release:        FreeBSD 4.7-RELEASE-p2 i386
>Organization:
NCTU CSIE FreeBSD Server
>Environment:
System: FreeBSD AlanSung.dragon2.net 4.7-RELEASE-p2 FreeBSD 4.7-RELEASE-p2 #2: Mon Nov 18 12:12:55 CST 2002 root@AlanSung.dragon2.net:/usr/obj/usr/src/sys/SungSung i386


>Description:
    The original pdftohtml has two output formats: HTML and XML. I need to
    convert PDF files to plain text; a useful command for that is
    pdftohtml -i -noframes -stdout something.pdf | w3m -dump -T text/html - > something.txt
    but if the user does not have w3m installed, they must find an alternative.
>How-To-Repeat:
>Fix:
    After applying the patch, the user can run
    %pdftohtml -i -noframes -text something.pdf something
    to generate a plain-text file from the PDF.

diff -ruN /usr/ports/textproc/pdftohtml/files/patch-HtmlFonts.cc /tmp/pdftohtml/files/patch-HtmlFonts.cc
--- /usr/ports/textproc/pdftohtml/files/patch-HtmlFonts.cc	Thu Jan  1 08:00:00 1970
+++ /tmp/pdftohtml/files/patch-HtmlFonts.cc	Sat Jan  4 21:38:39 2003
@@ -0,0 +1,34 @@
+--- src/HtmlFonts.cc.orig	Mon Jun 17 03:05:55 2002
++++ src/HtmlFonts.cc	Sat Jan  4 21:30:42 2003
+@@ -27,6 +27,7 @@
+ 
+ #define xoutRound(x) ((int)(x + 0.5))
+ extern GBool xml;
++extern GBool text;
+ 
+ const int font_num=13;
+ GString* HtmlFont::DefaultFont=new GString("Times"); // Arial,Helvetica,sans-serif
+@@ -194,10 +195,10 @@
+   for (int i = 0; i < uLen; ++i) {
+     switch (u[i])
+       { 
+-	case '"': tmp->append("&quot;");  break;
+-	case '&': tmp->append("&amp;");  break;
+-	case '<': tmp->append("&lt;");  break;
+-	case '>': tmp->append("&gt;");  break;
++	case '"': text ? tmp->append("\"") : tmp->append("&quot;");  break;
++	case '&': text ? tmp->append("&") : tmp->append("&amp;");  break;
++	case '<': text ? tmp->append("<") : tmp->append("&lt;");  break;
++	case '>': text ? tmp->append(">") : tmp->append("&gt;");  break;
+ 	default:  
+ 	  {
+ 	    // convert unicode to string
+@@ -248,7 +249,7 @@
+   GString *tmp;
+   GString *iStr=GString::IntToStr(i);
+   
+-  if (!xml) {
++  if (!xml && !text) {
+     tmp = new GString("<span class=\"ft");
+     tmp->append(iStr);
+     tmp->append("\">");
diff -ruN /usr/ports/textproc/pdftohtml/files/patch-HtmlOutputDev.cc /tmp/pdftohtml/files/patch-HtmlOutputDev.cc
--- /usr/ports/textproc/pdftohtml/files/patch-HtmlOutputDev.cc	Thu Jan  1 08:00:00 1970
+++ /tmp/pdftohtml/files/patch-HtmlOutputDev.cc	Sat Jan  4 22:04:19 2003
@@ -0,0 +1,220 @@
+--- src/HtmlOutputDev.cc.orig	Mon Jun 17 03:15:33 2002
++++ src/HtmlOutputDev.cc	Sat Jan  4 22:00:29 2003
+@@ -42,6 +42,7 @@
+ extern GBool noframes;
+ extern GBool stout;
+ extern GBool xml;
++extern GBool text;
+ extern GBool showHidden;
+ extern GBool noMerge;
+ 
+@@ -475,6 +476,106 @@
+ 
+ }
+ 
++void HtmlPage::textcoalesce() {
++  HtmlString *str1, *str2;
++  HtmlFont *hfont1, *hfont2;
++  double space, d, vertSpace;
++  GBool addSpace, addLineBreak;
++  int n, i;
++  double curX, curY;
++  // Text-mode counterpart of coalesce(): merges neighbouring entries of the yxStrings list in place.
++  str1 = yxStrings;
++  // Nothing to merge when the list is empty.
++  if( !str1 ) return;
++  // curX/curY remember where the current merged run started; they are written back to str1 when the run ends.
++  hfont1 = getFont(str1);
++  curX = str1->xMin; curY = str1->yMin;
++  // Fold str2 into str1 while the fonts match (ignoring bold) and str2 is close enough horizontally or starts the next line at nearly the same x.
++  while (str1 && (str2 = str1->yxNext)) {
++    hfont2 = getFont(str2);
++    space = str1->yMax - str1->yMin;
++    d = str2->xMin - str1->xMax;
++    addLineBreak = !noMerge && (fabs(str1->xMin - str2->xMin) < 0.4);
++    vertSpace = str2->yMin - str1->yMax;
++    if (((((rawOrder &&
++	  ((str2->yMin >= str1->yMin && str2->yMin <= str1->yMax) ||
++	   (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax))) ||
++	 (!rawOrder && str2->yMin < str1->yMax)) &&
++	d > -0.5 * space && d < space) ||
++       (vertSpace >= 0 && vertSpace < 0.5 * space && 
++	addLineBreak)) &&
++	(hfont1->isEqualIgnoreBold(*hfont2))
++ 	) 
++    {
++	n = str1->len + str2->len;
++	if ((addSpace = d > 0.1 * space)) {
++	    ++n;
++	}
++	if (addLineBreak) {
++	    ++n;
++	}
++      // Round the capacity up to a multiple of 16 and grow both per-character arrays to hold n entries.
++      str1->size = (n + 15) & ~15;
++      str1->text = (Unicode *)grealloc(str1->text,
++				       str1->size * sizeof(Unicode));
++      str1->xRight = (double *)grealloc(str1->xRight,
++					str1->size * sizeof(double));
++      if (addSpace) {
++	str1->text[str1->len] = 0x20;
++	str1->htext->append(" ");
++	str1->xRight[str1->len] = str2->xMin;
++	++str1->len;
++      }
++      if (addLineBreak) {
++	  str1->text[str1->len] = '\n';
++	  str1->htext->append("\n");
++	  str1->xRight[str1->len] = str2->xMin;
++	  ++str1->len;
++	  str1->yMin = str2->yMin;
++	  str1->yMax = str2->yMax;
++	  str1->xMax = str2->xMax;
++	  int fontLineSize = hfont1->getLineSize();
++	  int curLineSize = (int)(vertSpace + space); 
++	  if( curLineSize != fontLineSize )
++	  {
++	      HtmlFont *newfnt = new HtmlFont(*hfont1);
++	      newfnt->setLineSize(curLineSize);
++	      str1->fontpos = fonts->AddFont(*newfnt);
++	      delete newfnt;
++	      hfont1 = getFont(str1);
++	      // we have to reget hfont2 because it's location could have
++	      // changed on resize
++	      hfont2 = getFont(str2); 
++	  }
++      }
++      for (i = 0; i < str2->len; ++i) {
++	str1->text[str1->len] = str2->text[i];
++	str1->xRight[str1->len] = str2->xRight[i];
++	++str1->len;
++      }
++      // Append str2's htext as well; str2 is unlinked from the list and deleted further down.
++      str1->htext->append(str2->htext);
++      // str1 now contains href for link of str2 (if it is defined)
++      str1->link = str2->link; 
++      hfont1 = hfont2;
++      if (str2->xMax > str1->xMax) {
++	str1->xMax = str2->xMax;
++      }
++      if (str2->yMax > str1->yMax) {
++	str1->yMax = str2->yMax;
++      }
++      str1->yxNext = str2->yxNext;
++      delete str2;
++    } else {
++      str1->xMin = curX; str1->yMin = curY; 
++      str1 = str2;
++      curX = str1->xMin; curY = str1->yMin;
++      hfont1 = hfont2;
++    }
++  }
++  str1->xMin = curX; str1->yMin = curY;
++}
++
+ void HtmlPage::dumpAsXML(FILE* f,int page){  
+   fprintf(f, "<page number=\"%d\" position=\"absolute\"", page);
+   fprintf(f," top=\"0\" left=\"0\" height=\"%d\" width=\"%d\">\n", pageHeight,pageWidth);
+@@ -504,6 +605,24 @@
+   fputs("</page>\n",f);
+ }
+ 
++void HtmlPage::dumpAsTEXT(FILE* f,int page){  
++  fprintf(f, "**** page number=\"%d\" ****\n", page);
++  // Write every string of this page to <f>, one per line, after the page banner.
++  // str1 stays NULL for a string without a font (fontpos == -1); its raw htext is written then.
++  for(HtmlString *tmp=yxStrings;tmp;tmp=tmp->yxNext){
++    if (tmp->htext){
++      GString *str=new GString(tmp->htext), *str1=NULL;
++      if (tmp->fontpos!=-1){
++	str1=fonts->getCSStyle(tmp->fontpos, str);
++      }
++      fputs((str1 ? str1 : str)->getCString(),f);
++      delete str;
++      delete str1;
++      fputs("\n",f);
++    }
++  }
++  fputs("\n",f);
++}
+ 
+ void HtmlPage::dumpComplex(FILE *file, int page){
+   FILE* pageFile;
+@@ -598,8 +717,13 @@
+ 
+   nump++;
+   if (mode){
+-    if (xml) dumpAsXML(f,nump);
+-    if (!xml) dumpComplex(f, nump);  
++    if (xml) { 
++	dumpAsXML(f,nump);
++    } else if(text) {
++	dumpAsTEXT(f,nump);
++    } else {
++	dumpComplex(f, nump);  
++    };
+   }
+   else{
+     fprintf(f,"<a name=%d></a>",nump);
+@@ -724,7 +848,7 @@
+     }*/
+ 
+   //Complex and simple doc with frames
+-  if(!xml&&!noframes){
++  if(!text&&!xml&&!noframes){
+      GString* left=new GString(fileName);
+      left->append("_ind.html");
+      doFrame();
+@@ -755,8 +879,13 @@
+     if (stout) page=stdout;
+     else {
+       GString* right=new GString(fileName);
+-      if (!xml) right->append(".html");
+-      if (xml) right->append(".xml");
++      if (xml) {
++	  right->append(".xml");
++      } else if (text) {
++	  right->append(".txt");
++      } else {
++          right->append(".html");
++      }
+       if (!(page=fopen(right->getCString(),"w"))){
+ 	delete right;
+ 	error(-1, "Couldn't open html file '%s'", right->getCString());
+@@ -768,6 +897,8 @@
+       fputs("<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n", page);
+       fputs("<!DOCTYPE pdf2xml SYSTEM \"pdf2xml.dtd\">\n\n", page);
+       fputs("<pdf2xml>\n",page);
++    } else if (text) {
++    // no need anything
+     } else {
+       fprintf(page,"<html>\n<head>\n<title>%s</title>\n",docTitle->getCString());//tmp->getCString());
+       fprintf(page, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=%s\">\n", globalParams->getTextEncodingName()->getCString());
+@@ -801,6 +932,8 @@
+     if (xml) {
+       fputs("</pdf2xml>\n",page);  
+       fclose(page);
++    } else if (text) {
++      fclose(page);
+     } else
+     if (!mode||xml){ 
+       fputs("</body>\n</html>\n",page);  
+@@ -851,7 +984,8 @@
+ 
+ void HtmlOutputDev::endPage() {
+   pages->conv();
+-  pages->coalesce();
++  if (text) pages->textcoalesce();
++  else pages->coalesce();
+   pages->dump(page);
+   
+   // I don't yet know what to do in the case when there are pages of different
+@@ -860,7 +994,7 @@
+   maxPageWidth = pages->pageWidth;
+   maxPageHeight = pages->pageHeight;
+   
+-  if(!noframes&&!xml) fputs("<br>", f);
++  if(!noframes&&!xml&&!text) fputs("<br>", f);
+   if(!stout && !globalParams->getErrQuiet()) printf("Page-%d\n",(pageNum));
+   pageNum++ ;
+ }
diff -ruN /usr/ports/textproc/pdftohtml/files/patch-HtmlOutputDev.h /tmp/pdftohtml/files/patch-HtmlOutputDev.h
--- /usr/ports/textproc/pdftohtml/files/patch-HtmlOutputDev.h	Thu Jan  1 08:00:00 1970
+++ /tmp/pdftohtml/files/patch-HtmlOutputDev.h	Sat Jan  4 22:04:07 2003
@@ -0,0 +1,19 @@
+--- src/HtmlOutputDev.h.orig	Tue Jun 18 09:44:18 2002
++++ src/HtmlOutputDev.h	Sat Jan  4 21:56:00 2003
+@@ -106,6 +106,8 @@
+ 
+   // Coalesce strings that look like parts of the same line.
+   void coalesce();
++  // Coalesce strings that look like parts of the same line. text ver.
++  void textcoalesce();
+ 
+   // Find a string.  If <top> is true, starts looking at top of page;
+   // otherwise starts looking at <xMin>,<yMin>.  If <bottom> is true,
+@@ -139,6 +141,7 @@
+   
+   void setDocName(char* fname);
+   void dumpAsXML(FILE* f,int page);
++  void dumpAsTEXT(FILE* f,int page);
+   void dumpComplex(FILE* f, int page);
+ 
+   // marks the position of the fonts that belong to current page (for noframes)
diff -ruN /usr/ports/textproc/pdftohtml/files/patch-pdftohtml.cc /tmp/pdftohtml/files/patch-pdftohtml.cc
--- /usr/ports/textproc/pdftohtml/files/patch-pdftohtml.cc	Thu Jan  1 08:00:00 1970
+++ /tmp/pdftohtml/files/patch-pdftohtml.cc	Sat Jan  4 21:38:19 2003
@@ -0,0 +1,72 @@
+--- src/pdftohtml.cc.orig	Tue Jun 18 09:44:31 2002
++++ src/pdftohtml.cc	Sat Jan  4 21:30:47 2003
+@@ -42,6 +42,7 @@
+ GBool noframes=gFalse;
+ GBool stout=gFalse;
+ GBool xml=gFalse;
++GBool text=gFalse;
+ GBool errQuiet=gFalse;
+ 
+ GBool showHidden = gFalse;
+@@ -81,6 +82,8 @@
+    "zoom the pdf document (default 1.5)"},
+   {"-xml",    argFlag,    &xml,         0,
+    "output for XML post-processing"},
++  {"-text",    argFlag,    &text,         0,
++   "output to Plain-Text "},
+   {"-hidden", argFlag,   &showHidden,   0,
+    "output hidden text"},
+   {"-nomerge", argFlag, &noMerge, 0,
+@@ -117,7 +120,7 @@
+     fprintf(stderr, "based on Xpdf version %s\n", xpdfVersion);
+     fprintf(stderr, "%s\n\n", xpdfCopyright);
+     if (!printVersion) {
+-      printUsage("pdftohtml", "<PDF-file> [<html-file> <xml-file>]", argDesc);
++      printUsage("pdftohtml", "<PDF-file> [<html-file> <xml-file> <text-file>]", argDesc);
+     }
+     exit(1);
+   }
+@@ -175,16 +178,23 @@
+   if (argc == 3) {
+     GString* tmp = new GString(argv[2]);
+     p=tmp->getCString()+tmp->getLength()-5;
+-    if (!xml)
++    if (!xml && !text) {
+       if (!strcmp(p, ".html") || !strcmp(p, ".HTML"))
+ 	htmlFileName = new GString(tmp->getCString(),
+ 				   tmp->getLength() - 5);
+       else htmlFileName =new GString(tmp);
+-    else   
++    } else if (xml) {  
+       if (!strcmp(p, ".xml") || !strcmp(p, ".XML"))
+ 	htmlFileName = new GString(tmp->getCString(),
+ 				   tmp->getLength() - 5);
+       else htmlFileName =new GString(tmp);
++    } else if (text) {
++      if (!strcmp(p, ".txt") || !strcmp(p, ".TXT"))
++	htmlFileName = new GString(tmp->getCString(),
++				   tmp->getLength() - 5);
++      else htmlFileName =new GString(tmp);
++    }
++
+     
+     delete tmp;
+   } else {
+@@ -210,7 +220,7 @@
+      mode=gFalse;
+    }
+ 
+-   if (xml)
++   if (xml || text)
+    { 
+        mode = gTrue;
+        noframes = gTrue;
+@@ -237,7 +247,7 @@
+   if (htmlOut->isOk())  
+     doc->displayPages(htmlOut, firstPage, lastPage, static_cast<int>(72*scale), 0, gTrue);
+   
+-  if( mode && !xml && !ignore ) {
++  if( mode && !xml && !ignore && !text) {
+     int h=xoutRound(htmlOut->getPageHeight()/scale);
+     int w=xoutRound(htmlOut->getPageWidth()/scale);
+     //int h=xoutRound(doc->getPageHeight(1)/scale);

>Release-Note:
>Audit-Trail:
>Unformatted:

To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-ports" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200301041414.h04EEV4B070967>