Monday, February 20, 2012

Kindle Formatting 3: Using Java to create the toc.ncx entries

The toc.ncx file is used by Kindles for navigation. The Java source file, NcxGenTOC.java, is shown below.

The three arguments I used were:

  • toc-input.txt [a snippet into which I copied the MsoToc1 lines from mynovel.htm for input. I suppose you COULD use mynovel.htm itself, but you would have to remove the first entry because it is the text "Table of Contents". You'll see what I mean when you open mynovel.htm in Notepad++.]
  • mynovel
  • mynovel.htm
The output files are used to cut and paste into the real toc.ncx file.
  • toc-ncx.txt
  • toc-html.txt [unused because I use Calibre later]
Take the toc-ncx.txt file and cut/paste its contents into the space of your ncx template that you created in Step 2.

Import this Java code into your Eclipse Workspace and run it with Configuration: toc-input.txt, mynovel, mynovel.htm

import java.io.*;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Generates table-of-contents snippets, {@code toc-ncx.txt} and {@code toc-html.txt}, for
 * cut/paste into {@code toc.ncx} and {@code toc.htm} for KindleGen.
 *
 * <p>The input is a text file containing the {@code MsoToc1} lines copied from the HTML that
 * Microsoft Word produced. Heading text and link targets are copied verbatim, so special
 * characters such as "smart quotes" still need to be corrected manually.
 */
public final class NcxGenTOC {

  /** Prefix that identifies a Word-generated level-1 TOC line; tweak as needed. */
  private static final String TOC_LINE_PREFIX = "<div class=\"MsoToc1\">";

  /**
   * Captures the value of an {@code href} attribute. Matching only {@code href} (rather than any
   * quoted string) keeps the {@code class="MsoToc1"} attribute from being taken for a link.
   */
  private static final Pattern TOC_REF =
      Pattern.compile("\\bhref\\s*=\\s*\"([^\"]*)\"", Pattern.CASE_INSENSITIVE);

  /**
   * Captures tag-free text that sits between {@code ">} and the next closing tag. Only angle
   * brackets are excluded, so headings may contain parentheses, slashes and carets.
   */
  private static final Pattern TOC_HEAD = Pattern.compile("\">([^<>]*)</");

  private final String fFileName;
  private final String fDocTitle;
  private final String fHtmlFileName;

  /** Character encoding used for both output files. */
  String encoding = "UTF-8";

  /** Path of the generated NCX snippet. */
  String outfile = "toc-ncx.txt";

  /** Path of the generated HTML snippet. */
  String tocHtml = "toc-html.txt";

  /**
   * Command-line entry point.
   *
   * @param aArgs three arguments: the file holding the cut/paste of MsoToc1 entries, the
   *     document title, and the document's HTML file name; extra arguments are ignored
   * @throws IllegalArgumentException if fewer than three arguments are supplied
   * @throws IOException if the input cannot be read or an output file cannot be written
   */
  public static void main(String... aArgs) throws IOException {
    if (aArgs == null || aArgs.length < 3) {
      throw new IllegalArgumentException(
          "Usage: NcxGenTOC <toc-input-file> <document-title> <document-html-file-name>");
    }
    NcxGenTOC generator = new NcxGenTOC(aArgs[0], aArgs[1], aArgs[2]);
    generator.convert();
  }

  /**
   * Constructor.
   *
   * @param aFileName path of the file containing the MsoToc1 lines
   * @param aDocTitle title written into the {@code <docTitle>} element
   * @param aHtmlFileName HTML file name that is prefixed to every link target
   */
  NcxGenTOC(String aFileName, String aDocTitle, String aHtmlFileName) {
    fFileName = aFileName;
    fDocTitle = aDocTitle;
    fHtmlFileName = aHtmlFileName;
  }

  /**
   * Generates the cut/paste files.
   *
   * <p>The input is opened before the outputs are created, so a missing input file does not
   * truncate existing output files. All three streams are closed on every exit path.
   *
   * @throws IOException if the input cannot be read or an output file cannot be written
   */
  void convert() throws IOException {
    final String nl = System.getProperty("line.separator");

    log("Reading from file: " + fFileName);
    Scanner scanner = new Scanner(new FileInputStream(fFileName), "UTF-8");
    try {
      log("creating output file.");
      Writer out = new OutputStreamWriter(new FileOutputStream(outfile), encoding);
      try {
        Writer htmlOut = new OutputStreamWriter(new FileOutputStream(tocHtml), encoding);
        try {
          writeHeader(out, nl);
          writeEntries(scanner, out, htmlOut, nl);
          // Written only after a complete, successful pass so that a failure never leaves
          // behind a snippet that merely looks finished.
          out.write("</navMap>" + nl);
        } finally {
          htmlOut.close();
        }
      } finally {
        out.close();
      }
    } finally {
      scanner.close();
    }
    log("Files written: " + outfile + " and " + tocHtml);
  }

  /** Writes the {@code <docTitle>} element and opens {@code <navMap>}. */
  private void writeHeader(Writer out, String nl) throws IOException {
    out.write("<docTitle>" + nl);
    out.write("<text>" + fDocTitle + "</text>" + nl);
    out.write("</docTitle>" + nl);
    out.write("<navMap>" + nl);
  }

  /** Emits one navPoint and one HTML link for every TOC line read from the scanner. */
  private void writeEntries(Scanner scanner, Writer out, Writer htmlOut, String nl)
      throws IOException {
    int playOrder = 0;
    while (scanner.hasNextLine()) {
      String text = scanner.nextLine();

      // Specific for MSFT Word generated entries, tweak as needed
      if (!text.startsWith(TOC_LINE_PREFIX)) {
        continue;
      }
      playOrder++;
      log(playOrder + ": " + text);

      out.write("<navPoint id=\"navPoint-" + playOrder
          + "\" playOrder=\"" + playOrder + "\">" + nl + nl);

      // Reset per entry so that a line without a match never inherits the previous entry's text.
      String header = "";
      Matcher headMatcher = TOC_HEAD.matcher(text);
      while (headMatcher.find()) {
        header = headMatcher.group(1);
        log(header);
        out.write("<navLabel>" + nl
            + "<text>" + header + "</text>" + nl
            + "</navLabel>" + nl + nl);
      }

      String refPt = "";
      Matcher refMatcher = TOC_REF.matcher(text);
      if (refMatcher.find()) {
        refPt = refMatcher.group(1);
        log(refPt);
      } else {
        log("WARNING: no href found in entry " + playOrder + "; linking to " + fHtmlFileName);
      }

      // Exactly one content element and one closing tag per navPoint keeps the snippet
      // well-formed however many quoted attributes the line contains.
      out.write("<content src=\"" + fHtmlFileName + refPt + "\"/>" + nl
          + "</navPoint>" + nl + nl);

      htmlOut.write("<a href=\"" + fHtmlFileName + refPt + "\">" + header + "</a><br />" + nl);
    }

    // Scanner swallows read errors and simply stops iterating; surface them to the caller.
    IOException readError = scanner.ioException();
    if (readError != null) {
      throw readError;
    }
  }

  /** Prints a progress message to standard output. */
  private void log(String aMessage) {
    System.out.println(aMessage);
  }
}

No comments:

Post a Comment