Date: Sat, 4 Jan 2003 22:14:31 +0800 (CST) From: Cheng-Lung Sung <AlanSung@alansung.dragon2.net> To: FreeBSD-gnats-submit@FreeBSD.org Cc: clsung@dragon2.net Subject: ports/46754: [PATCH] pdftohtml add Plain-Text output Message-ID: <200301041414.h04EEV4B070967@alansung.dragon2.net>
next in thread | raw e-mail | index | archive | help
>Number: 46754 >Category: ports >Synopsis: [PATCH] pdftohtml add Plain-Text output >Confidential: no >Severity: non-critical >Priority: low >Responsible: freebsd-ports >State: open >Quarter: >Keywords: >Date-Required: >Class: change-request >Submitter-Id: current-users >Arrival-Date: Sat Jan 04 06:20:01 PST 2003 >Closed-Date: >Last-Modified: >Originator: Cheng-Lung Sung <clsung@dragon2.net> >Release: FreeBSD 4.7-RELEASE-p2 i386 >Organization: NCTU CSIE FreeBSD Server >Environment: System: FreeBSD AlanSung.dragon2.net 4.7-RELEASE-p2 FreeBSD 4.7-RELEASE-p2 #2: Mon Nov 18 12:12:55 CST 2002 root@AlanSung.dragon2.net:/usr/obj/usr/src/sys/SungSung i386 >Description: The original pdftohtml has two output formats, HTML and XML, but I need to convert PDF files to plain text. A useful command is pdftohtml -i -noframes -stdout something.pdf | w3m -dump -T text/html - > something.txt . But if the user does not have w3m installed, they must find an alternative. >How-To-Repeat: >Fix: After applying the patch, the user can run the command %pdftohtml -i -noframes -text something.pdf something to generate plain-text output from PDF files diff -ruN /usr/ports/textproc/pdftohtml/files/patch-HtmlFonts.cc /tmp/pdftohtml/files/patch-HtmlFonts.cc --- /usr/ports/textproc/pdftohtml/files/patch-HtmlFonts.cc Thu Jan 1 08:00:00 1970 +++ /tmp/pdftohtml/files/patch-HtmlFonts.cc Sat Jan 4 21:38:39 2003 @@ -0,0 +1,34 @@ +--- src/HtmlFonts.cc.orig Mon Jun 17 03:05:55 2002 ++++ src/HtmlFonts.cc Sat Jan 4 21:30:42 2003 +@@ -27,6 +27,7 @@ + + #define xoutRound(x) ((int)(x + 0.5)) + extern GBool xml; ++extern GBool text; + + const int font_num=13; + GString* HtmlFont::DefaultFont=new GString("Times"); // Arial,Helvetica,sans-serif +@@ -194,10 +195,10 @@ + for (int i = 0; i < uLen; ++i) { + switch (u[i]) + { +- case '"': tmp->append("&quot;"); break; +- case '&': tmp->append("&amp;"); break; +- case '<': tmp->append("&lt;"); break; +- case '>': tmp->append("&gt;"); break; ++ case '"': text ? tmp->append("\"") : tmp->append("&quot;"); break; ++ case '&': text ? 
tmp->append("&") : tmp->append("&amp;"); break; ++ case '<': text ? tmp->append("<") : tmp->append("&lt;"); break; ++ case '>': text ? tmp->append(">") : tmp->append("&gt;"); break; + default: + { + // convert unicode to string +@@ -248,7 +249,7 @@ + GString *tmp; + GString *iStr=GString::IntToStr(i); + +- if (!xml) { ++ if (!xml && !text) { + tmp = new GString("<span class=\"ft"); + tmp->append(iStr); + tmp->append("\">"); diff -ruN /usr/ports/textproc/pdftohtml/files/patch-HtmlOutputDev.cc /tmp/pdftohtml/files/patch-HtmlOutputDev.cc --- /usr/ports/textproc/pdftohtml/files/patch-HtmlOutputDev.cc Thu Jan 1 08:00:00 1970 +++ /tmp/pdftohtml/files/patch-HtmlOutputDev.cc Sat Jan 4 22:04:19 2003 @@ -0,0 +1,220 @@ +--- src/HtmlOutputDev.cc.orig Mon Jun 17 03:15:33 2002 ++++ src/HtmlOutputDev.cc Sat Jan 4 22:00:29 2003 +@@ -42,6 +42,7 @@ + extern GBool noframes; + extern GBool stout; + extern GBool xml; ++extern GBool text; + extern GBool showHidden; + extern GBool noMerge; + +@@ -475,6 +476,106 @@ + + } + ++void HtmlPage::textcoalesce() { ++ HtmlString *str1, *str2; ++ HtmlFont *hfont1, *hfont2; ++ double space, d, vertSpace; ++ GBool addSpace, addLineBreak; ++ int n, i; ++ double curX, curY; ++ ++ str1 = yxStrings; ++ ++ if( !str1 ) return; ++ ++ hfont1 = getFont(str1); ++ curX = str1->xMin; curY = str1->yMin; ++ ++ while (str1 && (str2 = str1->yxNext)) { ++ hfont2 = getFont(str2); ++ space = str1->yMax - str1->yMin; ++ d = str2->xMin - str1->xMax; ++ addLineBreak = !noMerge && (fabs(str1->xMin - str2->xMin) < 0.4); ++ vertSpace = str2->yMin - str1->yMax; ++ if (((((rawOrder && ++ ((str2->yMin >= str1->yMin && str2->yMin <= str1->yMax) || ++ (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax))) || ++ (!rawOrder && str2->yMin < str1->yMax)) && ++ d > -0.5 * space && d < space) || ++ (vertSpace >= 0 && vertSpace < 0.5 * space && ++ addLineBreak)) && ++ (hfont1->isEqualIgnoreBold(*hfont2)) ++ ) ++ { ++ n = str1->len + str2->len; ++ if ((addSpace = d > 0.1 * space)) { ++ ++n; ++ } 
++ if (addLineBreak) { ++ ++n; ++ } ++ ++ str1->size = (n + 15) & ~15; ++ str1->text = (Unicode *)grealloc(str1->text, ++ str1->size * sizeof(Unicode)); ++ str1->xRight = (double *)grealloc(str1->xRight, ++ str1->size * sizeof(double)); ++ if (addSpace) { ++ str1->text[str1->len] = 0x20; ++ str1->htext->append(" "); ++ str1->xRight[str1->len] = str2->xMin; ++ ++str1->len; ++ } ++ if (addLineBreak) { ++ str1->text[str1->len] = '\n'; ++ str1->htext->append("\n"); ++ str1->xRight[str1->len] = str2->xMin; ++ ++str1->len; ++ str1->yMin = str2->yMin; ++ str1->yMax = str2->yMax; ++ str1->xMax = str2->xMax; ++ int fontLineSize = hfont1->getLineSize(); ++ int curLineSize = (int)(vertSpace + space); ++ if( curLineSize != fontLineSize ) ++ { ++ HtmlFont *newfnt = new HtmlFont(*hfont1); ++ newfnt->setLineSize(curLineSize); ++ str1->fontpos = fonts->AddFont(*newfnt); ++ delete newfnt; ++ hfont1 = getFont(str1); ++ // we have to reget hfont2 because it's location could have ++ // changed on resize ++ hfont2 = getFont(str2); ++ } ++ } ++ for (i = 0; i < str2->len; ++i) { ++ str1->text[str1->len] = str2->text[i]; ++ str1->xRight[str1->len] = str2->xRight[i]; ++ ++str1->len; ++ } ++ ++ str1->htext->append(str2->htext); ++ // str1 now contains href for link of str2 (if it is defined) ++ str1->link = str2->link; ++ hfont1 = hfont2; ++ if (str2->xMax > str1->xMax) { ++ str1->xMax = str2->xMax; ++ } ++ if (str2->yMax > str1->yMax) { ++ str1->yMax = str2->yMax; ++ } ++ str1->yxNext = str2->yxNext; ++ delete str2; ++ } else { ++ str1->xMin = curX; str1->yMin = curY; ++ str1 = str2; ++ curX = str1->xMin; curY = str1->yMin; ++ hfont1 = hfont2; ++ } ++ } ++ str1->xMin = curX; str1->yMin = curY; ++} ++ + void HtmlPage::dumpAsXML(FILE* f,int page){ + fprintf(f, "<page number=\"%d\" position=\"absolute\"", page); + fprintf(f," top=\"0\" left=\"0\" height=\"%d\" width=\"%d\">\n", pageHeight,pageWidth); +@@ -504,6 +605,24 @@ + fputs("</page>\n",f); + } + ++void HtmlPage::dumpAsTEXT(FILE* f,int 
page){ ++ fprintf(f, "**** page number=\"%d\" ****\n", page); ++ ++ GString *str, *str1; ++ for(HtmlString *tmp=yxStrings;tmp;tmp=tmp->yxNext){ ++ if (tmp->htext){ ++ str=new GString(tmp->htext); ++ if (tmp->fontpos!=-1){ ++ str1=fonts->getCSStyle(tmp->fontpos, str); ++ } ++ fputs(str1->getCString(),f); ++ delete str; ++ delete str1; ++ fputs("\n",f); ++ } ++ } ++ fputs("\n",f); ++} + + void HtmlPage::dumpComplex(FILE *file, int page){ + FILE* pageFile; +@@ -598,8 +717,13 @@ + + nump++; + if (mode){ +- if (xml) dumpAsXML(f,nump); +- if (!xml) dumpComplex(f, nump); ++ if (xml) { ++ dumpAsXML(f,nump); ++ } else if(text) { ++ dumpAsTEXT(f,nump); ++ } else { ++ dumpComplex(f, nump); ++ }; + } + else{ + fprintf(f,"<a name=%d></a>",nump); +@@ -724,7 +848,7 @@ + }*/ + + //Complex and simple doc with frames +- if(!xml&&!noframes){ ++ if(!text&&!xml&&!noframes){ + GString* left=new GString(fileName); + left->append("_ind.html"); + doFrame(); +@@ -755,8 +879,13 @@ + if (stout) page=stdout; + else { + GString* right=new GString(fileName); +- if (!xml) right->append(".html"); +- if (xml) right->append(".xml"); ++ if (xml) { ++ right->append(".xml"); ++ } else if (text) { ++ right->append(".txt"); ++ } else { ++ right->append(".html"); ++ } + if (!(page=fopen(right->getCString(),"w"))){ + delete right; + error(-1, "Couldn't open html file '%s'", right->getCString()); +@@ -768,6 +897,8 @@ + fputs("<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n", page); + fputs("<!DOCTYPE pdf2xml SYSTEM \"pdf2xml.dtd\">\n\n", page); + fputs("<pdf2xml>\n",page); ++ } else if (text) { ++ // no need anything + } else { + fprintf(page,"<html>\n<head>\n<title>%s</title>\n",docTitle->getCString());//tmp->getCString()); + fprintf(page, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=%s\">\n", globalParams->getTextEncodingName()->getCString()); +@@ -801,6 +932,8 @@ + if (xml) { + fputs("</pdf2xml>\n",page); + fclose(page); ++ } else if (text) { ++ fclose(page); + } else + if 
(!mode||xml){ + fputs("</body>\n</html>\n",page); +@@ -851,7 +984,8 @@ + + void HtmlOutputDev::endPage() { + pages->conv(); +- pages->coalesce(); ++ if (text) pages->textcoalesce(); ++ else pages->coalesce(); + pages->dump(page); + + // I don't yet know what to do in the case when there are pages of different +@@ -860,7 +994,7 @@ + maxPageWidth = pages->pageWidth; + maxPageHeight = pages->pageHeight; + +- if(!noframes&&!xml) fputs("<br>", f); ++ if(!noframes&&!xml&&!text) fputs("<br>", f); + if(!stout && !globalParams->getErrQuiet()) printf("Page-%d\n",(pageNum)); + pageNum++ ; + } diff -ruN /usr/ports/textproc/pdftohtml/files/patch-HtmlOutputDev.h /tmp/pdftohtml/files/patch-HtmlOutputDev.h --- /usr/ports/textproc/pdftohtml/files/patch-HtmlOutputDev.h Thu Jan 1 08:00:00 1970 +++ /tmp/pdftohtml/files/patch-HtmlOutputDev.h Sat Jan 4 22:04:07 2003 @@ -0,0 +1,19 @@ +--- src/HtmlOutputDev.h.orig Tue Jun 18 09:44:18 2002 ++++ src/HtmlOutputDev.h Sat Jan 4 21:56:00 2003 +@@ -106,6 +106,8 @@ + + // Coalesce strings that look like parts of the same line. + void coalesce(); ++ // Coalesce strings that look like parts of the same line. text ver. ++ void textcoalesce(); + + // Find a string. If <top> is true, starts looking at top of page; + // otherwise starts looking at <xMin>,<yMin>. 
If <bottom> is true, +@@ -139,6 +141,7 @@ + + void setDocName(char* fname); + void dumpAsXML(FILE* f,int page); ++ void dumpAsTEXT(FILE* f,int page); + void dumpComplex(FILE* f, int page); + + // marks the position of the fonts that belong to current page (for noframes) diff -ruN /usr/ports/textproc/pdftohtml/files/patch-pdftohtml.cc /tmp/pdftohtml/files/patch-pdftohtml.cc --- /usr/ports/textproc/pdftohtml/files/patch-pdftohtml.cc Thu Jan 1 08:00:00 1970 +++ /tmp/pdftohtml/files/patch-pdftohtml.cc Sat Jan 4 21:38:19 2003 @@ -0,0 +1,72 @@ +--- src/pdftohtml.cc.orig Tue Jun 18 09:44:31 2002 ++++ src/pdftohtml.cc Sat Jan 4 21:30:47 2003 +@@ -42,6 +42,7 @@ + GBool noframes=gFalse; + GBool stout=gFalse; + GBool xml=gFalse; ++GBool text=gFalse; + GBool errQuiet=gFalse; + + GBool showHidden = gFalse; +@@ -81,6 +82,8 @@ + "zoom the pdf document (default 1.5)"}, + {"-xml", argFlag, &xml, 0, + "output for XML post-processing"}, ++ {"-text", argFlag, &text, 0, ++ "output to Plain-Text "}, + {"-hidden", argFlag, &showHidden, 0, + "output hidden text"}, + {"-nomerge", argFlag, &noMerge, 0, +@@ -117,7 +120,7 @@ + fprintf(stderr, "based on Xpdf version %s\n", xpdfVersion); + fprintf(stderr, "%s\n\n", xpdfCopyright); + if (!printVersion) { +- printUsage("pdftohtml", "<PDF-file> [<html-file> <xml-file>]", argDesc); ++ printUsage("pdftohtml", "<PDF-file> [<html-file> <xml-file> <text-file>]", argDesc); + } + exit(1); + } +@@ -175,16 +178,23 @@ + if (argc == 3) { + GString* tmp = new GString(argv[2]); + p=tmp->getCString()+tmp->getLength()-5; +- if (!xml) ++ if (!xml && !text) { + if (!strcmp(p, ".html") || !strcmp(p, ".HTML")) + htmlFileName = new GString(tmp->getCString(), + tmp->getLength() - 5); + else htmlFileName =new GString(tmp); +- else ++ } else if (xml) { + if (!strcmp(p, ".xml") || !strcmp(p, ".XML")) + htmlFileName = new GString(tmp->getCString(), + tmp->getLength() - 5); + else htmlFileName =new GString(tmp); ++ } else if (text) { ++ if (!strcmp(p, ".txt") || !strcmp(p, 
".TXT")) ++ htmlFileName = new GString(tmp->getCString(), ++ tmp->getLength() - 5); ++ else htmlFileName =new GString(tmp); ++ } ++ + + delete tmp; + } else { +@@ -210,7 +220,7 @@ + mode=gFalse; + } + +- if (xml) ++ if (xml || text) + { + mode = gTrue; + noframes = gTrue; +@@ -237,7 +247,7 @@ + if (htmlOut->isOk()) + doc->displayPages(htmlOut, firstPage, lastPage, static_cast<int>(72*scale), 0, gTrue); + +- if( mode && !xml && !ignore ) { ++ if( mode && !xml && !ignore && !text) { + int h=xoutRound(htmlOut->getPageHeight()/scale); + int w=xoutRound(htmlOut->getPageWidth()/scale); + //int h=xoutRound(doc->getPageHeight(1)/scale); >Release-Note: >Audit-Trail: >Unformatted: To Unsubscribe: send mail to majordomo@FreeBSD.org with "unsubscribe freebsd-ports" in the body of the message
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200301041414.h04EEV4B070967>