Deprecated Modules
The mupdf/tasks and mupdf/mupdfjs sub-modules have been deprecated and are no longer available.
The functions that the tasks and mupdfjs modules provided are now available as simple examples that you can copy into your own projects if you need them.
From andytango/mupdf-js
To help you migrate from the https://github.com/andytango/mupdf-js library, you can copy the tasks.ts file into your project.
examples/tasks/tasks.ts
import * as mupdf from "mupdf"
/**
 * Open a PDF document from bytes already held in memory.
 *
 * @param data - the raw contents of the PDF file
 * @returns the `mupdf.PDFDocument` constructed from `data`
 */
export function loadPDF(data: Buffer | ArrayBuffer | Uint8Array) {
	const pdf = new mupdf.PDFDocument(data)
	return pdf
}
/**
 * Render one page to PNG at the requested resolution.
 *
 * @param document - the document to render from
 * @param pageNumber - page index handed to `loadPage`
 * @param dpi - output resolution; the page is scaled by `dpi / 72`
 * @returns the PNG-encoded DeviceRGB rendering of the page
 */
export function drawPageAsPNG(document: mupdf.PDFDocument, pageNumber: number, dpi: number): Uint8Array {
	const sourcePage = document.loadPage(pageNumber)
	// Uniform scale with no rotation or translation.
	const scale = dpi / 72
	const pixmap = sourcePage.toPixmap([scale, 0, 0, scale, 0, 0], mupdf.ColorSpace.DeviceRGB)
	return pixmap.asPNG()
}
/**
 * Convert one page to HTML via its structured text.
 *
 * @param document - the document to read from
 * @param pageNumber - page index handed to `loadPage`
 * @param id - identifier forwarded unchanged to `asHTML`
 * @returns the HTML string produced by the structured text object
 */
export function drawPageAsHTML(document: mupdf.PDFDocument, pageNumber: number, id: number): string {
	const sourcePage = document.loadPage(pageNumber)
	const structuredText = sourcePage.toStructuredText()
	return structuredText.asHTML(id)
}
/**
 * Render one page as SVG markup.
 *
 * The page is run through an "svg" document writer that targets an in-memory
 * buffer, and the buffer's text is returned.
 *
 * @param document - the document to render from
 * @param pageNumber - page index handed to `loadPage`
 * @returns the buffer contents as a string once the page has been ended
 */
export function drawPageAsSVG(document: mupdf.PDFDocument, pageNumber: number): string {
	const sourcePage = document.loadPage(pageNumber)
	const svgOutput = new mupdf.Buffer()
	const svgWriter = new mupdf.DocumentWriter(svgOutput, "svg", "")
	const pageDevice = svgWriter.beginPage(sourcePage.getBounds())
	sourcePage.run(pageDevice, mupdf.Matrix.identity)
	pageDevice.close()
	svgWriter.endPage()
	return svgOutput.asString()
}
/**
 * Extract the plain text of one page.
 *
 * @param document - the document to read from
 * @param pageNumber - page index handed to `loadPage`
 * @returns whatever `asText()` yields for the page's structured text
 */
export function getPageText(document: mupdf.PDFDocument, pageNumber: number): string {
	const sourcePage = document.loadPage(pageNumber)
	const structuredText = sourcePage.toStructuredText()
	return structuredText.asText()
}
/**
 * Search one page's structured text for a string.
 *
 * @param document - the document to search
 * @param pageNumber - page index handed to `loadPage`
 * @param searchString - the text to look for
 * @param maxHits - hit limit forwarded to `search` (500 when omitted)
 * @returns the value returned by the structured text `search` call
 */
export function searchPageText(document: mupdf.PDFDocument, pageNumber: number, searchString: string, maxHits = 500) {
	const sourcePage = document.loadPage(pageNumber)
	const structuredText = sourcePage.toStructuredText()
	return structuredText.search(searchString, maxHits)
}
From mupdfjs
Splitting a Document
examples/tasks/pdf-split.ts
import * as mupdf from "mupdf"
/**
 * Split a document into several new documents.
 *
 * With no `range` (or an empty one) every page becomes its own single-page
 * document. Otherwise each entry of `range` is the first page index of a
 * part; a part runs up to (but not including) the next entry, and the last
 * part runs to the end of the document.
 *
 * @param document - the document to take pages from
 * @param range - starting page indexes of the parts, or undefined/empty
 * @returns the newly created documents, in order
 * @throws Error when an entry of `range` is negative
 */
export function split(document: mupdf.PDFDocument, range: number[] | undefined) {
	const parts: mupdf.PDFDocument[] = []

	if (range == undefined || range.length == 0) {
		// No ranges given: one output document per source page.
		let pageIndex = 0
		while (pageIndex < document.countPages()) {
			const single: mupdf.PDFDocument = new mupdf.PDFDocument()
			single.graftPage(0, document, pageIndex)
			parts.push(single)
			pageIndex += 1
		}
		return parts
	}

	// Expand every start index into the explicit list of pages it covers.
	// All ranges are validated before any output document is created.
	const pageSets: number[][] = []
	for (const [position, start] of range.entries()) {
		if (start < 0)
			throw new Error("Split error: document page indexes cannot be less than zero")
		const isLast = position + 1 > range.length - 1
		const stop: number = isLast ? document.countPages() : (range[position + 1] as number)
		const pages: number[] = []
		for (let p = start; p < stop; ++p)
			pages.push(p)
		pageSets.push(pages)
	}

	// Build one document per page set, sharing a graft map within each part.
	for (const pages of pageSets) {
		const part = new mupdf.PDFDocument()
		const graftMap = part.newGraftMap()
		// The position inside the set is the destination index for graftPage().
		pages.forEach((sourceIndex, targetIndex) => {
			graftMap.graftPage(targetIndex, document, sourceIndex)
		})
		parts.push(part)
	}
	return parts
}
Merging Documents
examples/tasks/pdf-merge.ts
import * as mupdf from "mupdf"
/**
 * Recreate one page of `srcDoc` inside `dstDoc`.
 *
 * A fresh page dictionary is built in the destination. Of the source page's
 * entries only MediaBox, Rotate, Resources and Contents are carried over,
 * each one grafted through `dstFromSrc`; entries for which `get` returns a
 * falsy value are skipped.
 *
 * @param dstDoc - document receiving the page
 * @param srcDoc - document the page is read from
 * @param pageNumber - index passed to `srcDoc.findPage`
 * @param insertAt - position passed to `dstDoc.insertPage`
 * @param dstFromSrc - graft map used to copy objects into `dstDoc`
 */
function copyPage(dstDoc, srcDoc, pageNumber, insertAt, dstFromSrc) {
	const sourcePage = srcDoc.findPage(pageNumber)
	const pageDict = dstDoc.newDictionary()
	pageDict.put("Type", dstDoc.newName("Page"))
	for (const key of ["MediaBox", "Rotate", "Resources", "Contents"]) {
		const entry = sourcePage.get(key)
		if (entry)
			pageDict.put(key, dstFromSrc.graftObject(entry))
	}
	dstDoc.insertPage(insertAt, dstDoc.addObject(pageDict))
}
/**
 * Copy a run of pages from `sourcePDF` into `targetPDF`.
 *
 * @param targetPDF - document that receives the pages
 * @param sourcePDF - document the pages are copied from
 * @param fromPage - first source page index; clamped into the source's range
 * @param toPage - last source page index (inclusive); a negative value means
 *   the last source page, larger values are clamped to it
 * @param startAt - insertion index in the target; a negative value means
 *   "append", larger values are clamped to the target's page count
 */
export function merge(targetPDF, sourcePDF, fromPage = 0, toPage = -1, startAt = -1) {
	const lastSourceIndex = sourcePDF.countPages() - 1
	const targetLength = targetPDF.countPages()
	// One graft map for the whole merge so every page copy goes through it.
	const graftMap = targetPDF.newGraftMap()

	const first = Math.max(0, Math.min(fromPage, lastSourceIndex))
	const last = toPage < 0 ? lastSourceIndex : Math.min(toPage, lastSourceIndex)
	const insertBase = startAt < 0 ? targetLength : Math.min(startAt, targetLength)

	let offset = 0
	while (first + offset <= last) {
		copyPage(targetPDF, sourcePDF, first + offset, insertBase + offset, graftMap)
		offset += 1
	}
}
Scrubbing a Document
examples/tasks/pdf-scrub.ts
import * as mupdf from "mupdf"
/**
 * Remove potentially sensitive data from a PDF document, in place.
 *
 * Every option defaults to enabled, and `options` itself may be omitted.
 *
 * Implemented options:
 * - `metadata`: blank the standard "info:" metadata fields
 * - `removeLinks`: delete every link on every page
 * - `attachedFiles`: replace the file spec of FileAttachment annotations with null
 * - `resetFields`: restore each widget's value ("V") from its default ("DV")
 * - `resetResponses`: delete "RT" and "IRT" from every annotation
 * - `thumbnails`: delete the "Thumb" entry of every page
 * - `cleanPages`: replace each page's Contents with the version produced by
 *   saving the document with "clean=yes"
 * - `embeddedFiles`: empty the Names array of Root/Names/EmbeddedFiles
 * - `javascript`: overwrite every object whose "S" is JavaScript with an empty script
 * - `xmlMetadata`: delete Root/Metadata
 *
 * `hiddenText`, `redactions` and `redactImages` are accepted for
 * compatibility but are not implemented and currently have no effect.
 *
 * @param document - the document to scrub
 * @param options - which scrubbing steps to run
 * @throws Error("is not PDF") when `document.isPDF()` is false
 * @throws Error("encrypted doc") when the document needs a password
 */
export function scrub(
	document: mupdf.PDFDocument,
	options: {
		attachedFiles?: boolean,
		cleanPages?: boolean,
		embeddedFiles?: boolean,
		hiddenText?: boolean,
		javascript?: boolean,
		metadata?: boolean,
		redactions?: boolean,
		redactImages?: number,
		removeLinks?: boolean,
		resetFields?: boolean,
		resetResponses?: boolean,
		thumbnails?: boolean,
		xmlMetadata?: boolean,
	} = {}
): void {
	const {
		attachedFiles = true,
		cleanPages = true,
		embeddedFiles = true,
		javascript = true,
		metadata = true,
		removeLinks = true,
		resetFields = true,
		resetResponses = true,
		thumbnails = true,
		xmlMetadata = true,
	} = options

	// Basic validation
	if (!document.isPDF()) {
		throw new Error("is not PDF")
	}
	if (document.needsPassword()) {
		throw new Error("encrypted doc")
	}

	// Metadata cleaning: clear all standard PDF metadata fields
	if (metadata) {
		const infoKeys = [
			"info:Title",
			"info:Author",
			"info:Subject",
			"info:Keywords",
			"info:Creator",
			"info:Producer",
			"info:CreationDate",
			"info:ModDate",
		]
		for (const key of infoKeys) {
			document.setMetaData(key, "")
		}
	}

	// Per-page work. Each page is loaded once and shared by all steps.
	const pageCount = document.countPages()
	const touchesPages = removeLinks || attachedFiles || resetFields || resetResponses || thumbnails
	if (touchesPages) {
		for (let i = 0; i < pageCount; i++) {
			const page = document.loadPage(i)

			// Remove links
			if (removeLinks) {
				for (const link of page.getLinks()) {
					page.deleteLink(link)
				}
			}

			// Handle attached files
			if (attachedFiles) {
				for (const annot of page.getAnnotations()) {
					if (annot.getType() === "FileAttachment") {
						annot.setFileSpec(document.newNull())
					}
				}
			}

			// Reset form fields
			if (resetFields) {
				for (const widget of page.getWidgets()) {
					const widgetObj = widget.getObject()
					const defaultValue = widgetObj.get("DV")
					// Without a default value the field is simply emptied.
					if (defaultValue.isNull()) {
						widgetObj.delete("V")
					} else {
						widgetObj.put("V", defaultValue)
					}
					// Checkboxes and radio buttons also need their appearance state reset.
					const widgetType = widget.getFieldType()
					if (widgetType === "checkbox" || widgetType === "radiobutton") {
						widgetObj.put("AS", defaultValue.isNull() ? document.newName("Off") : defaultValue)
					}
					widget.update()
				}
			}

			// Reset responses: drop the response type and in-response-to reference
			if (resetResponses) {
				for (const annot of page.getAnnotations()) {
					const annotObj = annot.getObject()
					annotObj.delete("RT")
					annotObj.delete("IRT")
					annot.update()
				}
			}

			// Remove thumbnails
			if (thumbnails) {
				page.getObject().delete("Thumb")
			}
		}
	}

	// Clean pages. This saves and reopens the WHOLE document and rewrites the
	// Contents of every page, so it is done once rather than once per page.
	if (cleanPages && pageCount > 0) {
		const cleanBuffer = document.saveToBuffer("clean=yes")
		const cleanDoc = mupdf.PDFDocument.openDocument(cleanBuffer, "application/pdf") as mupdf.PDFDocument
		// Copy the cleaned content streams back into this document
		const cleanPageCount = cleanDoc.countPages()
		for (let j = 0; j < cleanPageCount; j++) {
			const cleanPageObj = cleanDoc.loadPage(j).getObject()
			const thisPageObj = document.loadPage(j).getObject()
			thisPageObj.put("Contents", document.graftObject(cleanPageObj.get("Contents")))
		}
	}

	// Handle embedded files
	if (embeddedFiles) {
		const names = document.getTrailer().get("Root").get("Names")
		if (!names.isNull() && names.isDictionary()) {
			const embeddedFilesDict = names.get("EmbeddedFiles")
			if (!embeddedFilesDict.isNull() && embeddedFilesDict.isDictionary()) {
				embeddedFilesDict.put("Names", document.newArray())
			}
		}
	}

	// Handle JavaScript: walk every object number and blank out script actions
	if (javascript) {
		const xrefLength = document.countObjects()
		for (let xref = 1; xref < xrefLength; xref++) {
			const obj = document.newIndirect(xref)
			const resolvedObj = obj.resolve()
			if (resolvedObj.isDictionary()) {
				const type = resolvedObj.get("S")
				if (!type.isNull() && type.asName() === "JavaScript") {
					const newObj = document.newDictionary()
					newObj.put("S", document.newName("JavaScript"))
					newObj.put("JS", document.newString(""))
					obj.writeObject(newObj)
				}
			}
		}
	}

	// Handle XML metadata
	if (xmlMetadata) {
		document.getTrailer().get("Root").delete("Metadata")
	}
}
Adding Text to Pages
examples/tasks/page-insert-text.ts
import * as mupdf from "mupdf"
/**
 * Draw a single line of text onto a page by appending a content stream.
 *
 * The font is registered under the first unused "F<n>" name in the page's
 * Resources/Font dictionary, so fonts already present on the page are never
 * overwritten. Neither `point` nor `graphics` is modified.
 *
 * @param doc - the document that owns `page`
 * @param page - the page to draw on
 * @param value - the text; backslashes, parentheses and line breaks are
 *   escaped so they cannot break the PDF string literal
 * @param point - position measured from the top-left of the page; the y
 *   coordinate is flipped using the bottom edge of `page.getBounds()` and
 *   `fontSize`
 * @param fontName - name handed to the `mupdf.Font` constructor
 * @param fontSize - font size used by the `Tf` operator
 * @param graphics - stroke/fill colors (optional 4th component is opacity)
 *   and stroke thickness; missing members fall back to opaque black and 1.
 *   A thickness of 0 omits the stroke setup entirely.
 */
export function insertText(
	doc: mupdf.PDFDocument,
	page: mupdf.PDFPage,
	value: string,
	point: mupdf.Point,
	fontName: string = "Times-Roman",
	fontSize: number = 18,
	graphics: {
		strokeColor: mupdf.Color,
		fillColor: mupdf.Color,
		strokeThickness: number
	} = { strokeColor: [0, 0, 0, 1], fillColor: [0, 0, 0, 1], strokeThickness: 1 }
) {
	const pageObj = page.getObject()
	const font = new mupdf.Font(fontName)
	const fontResource = doc.addSimpleFont(font)

	// add the font to the page/Resources/Font dictionary (creating nested dictionaries as needed)
	let resources = pageObj.get("Resources")
	if (!resources.isDictionary())
		pageObj.put("Resources", resources = doc.newDictionary())
	let fontDict = resources.get("Font")
	if (!fontDict.isDictionary())
		resources.put("Font", fontDict = doc.newDictionary())

	// pick a resource name that is not taken, so existing page text keeps its font
	let fontIndex = 1
	while (!fontDict.get("F" + fontIndex).isNull())
		fontIndex++
	const fontKey = "F" + fontIndex
	fontDict.put(fontKey, fontResource)

	// Callers may pass only some members of "graphics"; fill the gaps using
	// local copies so that the caller's object is left untouched.
	const stroke: number[] = graphics.strokeColor == undefined ? [0, 0, 0, 1] : [...graphics.strokeColor]
	const fill: number[] = graphics.fillColor == undefined ? [0, 0, 0, 1] : [...graphics.fillColor]
	const strokeThickness = graphics.strokeThickness == undefined ? 1 : graphics.strokeThickness
	const strokeAlpha = stroke[3] == undefined ? 1 : stroke[3]
	const fillAlpha = fill[3] == undefined ? 1 : fill[3]

	// format this for the PDF markup language
	const strokeColor: string = stroke[0] + " " + stroke[1] + " " + stroke[2] + " RG"
	const fillColor: string = fill[0] + " " + fill[1] + " " + fill[2] + " rg"
	const strokeOpacity: string = (strokeAlpha * 100).toString()
	const fillOpacity: string = (fillAlpha * 100).toString()
	// "2 Tr" selects fill-then-stroke rendering; skipped when nothing is stroked
	const strokeThicknessMarkup = strokeThickness == 0 ? "" : "2 Tr " + strokeThickness + " w"

	// add the graphics state object (carrying the opacities) to the resources dictionary
	let graphicsStates = resources.get("ExtGState")
	if (!graphicsStates.isDictionary())
		resources.put("ExtGState", graphicsStates = doc.newDictionary())
	const graphicsDict = doc.newDictionary()
	graphicsDict.put("CA", strokeAlpha)
	graphicsDict.put("ca", fillAlpha)
	const graphicsStateIdentifier: string = "fitzca" + strokeOpacity + "" + fillOpacity
	graphicsStates.put(graphicsStateIdentifier, graphicsDict)
	const graphicsState: string = "/" + graphicsStateIdentifier + " gs"

	// invert the Y point (PDF space has its origin at the bottom-left)
	const x = point[0]
	const y = page.getBounds()[3] - (point[1] + fontSize)

	// escape the characters that are special inside a PDF literal string
	const literal = value
		.replace(/[\\()]/g, "\\$&")
		.replace(/\r/g, "\\r")
		.replace(/\n/g, "\\n")

	const contentStream: string = "q " + graphicsState + " BT " + strokeColor + " " + fillColor + " " + strokeThicknessMarkup + " /" + fontKey + " " + fontSize + " Tf " + x + " " + y + " Td (" + literal + ") Tj ET Q"
	console.log(`Inserting text to page with content stream:\n${contentStream}`)

	// Create drawing operations
	const extraContents = doc.addStream(contentStream, {})

	// Add drawing operations to page contents
	const pageContents = pageObj.get("Contents")
	if (pageContents.isNull()) {
		pageObj.put("Contents", extraContents)
	} else if (pageContents.isArray()) {
		// Contents is already an array, so append our new buffer object.
		pageContents.push(extraContents)
	} else {
		// Contents is not an array, so change it into an array
		// and then append our new buffer object.
		const newPageContents = doc.newArray()
		newPageContents.push(pageContents)
		newPageContents.push(extraContents)
		pageObj.put("Contents", newPageContents)
	}
}
Adding Images to Pages
examples/tasks/page-insert-image.ts
import * as mupdf from "mupdf"
/**
 * Draw an image onto a page by appending a content stream.
 *
 * The image is registered in the page's Resources/XObject dictionary under
 * `data.name` and painted with a `cm ... Do` sequence. The `metrics` object
 * passed by the caller is only read, never modified, so it can be reused
 * for several calls.
 *
 * @param doc - the document that owns `page`
 * @param page - the page to draw on
 * @param data - the image and the XObject resource name to store it under
 * @param metrics - placement measured from the top-left of the page. A
 *   missing or zero width/height falls back to the image's own size; a
 *   missing x or y falls back to 0.
 * @throws Error("Invalid image") when `data.image` is null or undefined
 * @throws Error("Invalid name") when `data.name` is missing or empty
 */
export function insertImage(
	doc: mupdf.PDFDocument,
	page: mupdf.PDFPage,
	data: { image: mupdf.Image, name: string },
	metrics: { x?: number, y?: number, width?: number, height?: number } = { x: 0, y: 0, width: 0, height: 0 }
) {
	if (data.image == null) {
		throw new Error("Invalid image");
	}
	if (data.name == null || data.name.length == 0) {
		throw new Error("Invalid name");
	}

	const pageObj = page.getObject()

	// add image object to the page/Resources/XObject dictionary (creating nested dictionaries as needed)
	let res = pageObj.get("Resources")
	if (!res.isDictionary())
		pageObj.put("Resources", res = doc.newDictionary())
	let resXObj = res.get("XObject")
	if (!resXObj.isDictionary())
		res.put("XObject", resXObj = doc.newDictionary())

	const image = doc.addImage(data.image)

	// Resolve the placement into locals; writing the results back into
	// "metrics" would flip y again if the caller reused the object.
	const width = metrics.width == 0 || metrics.width == undefined ? data.image.getWidth() : metrics.width
	const height = metrics.height == 0 || metrics.height == undefined ? data.image.getHeight() : metrics.height
	const x = metrics.x == undefined ? 0 : metrics.x
	const top = metrics.y == undefined ? 0 : metrics.y
	// invert the Y point (PDF space has its origin at the bottom-left)
	const y = page.getBounds()[3] - (top + height)

	resXObj.put(data.name, image)

	const contentStream: string = "q " + width + " 0 0 " + height + " " + x + " " + y + " cm /" + data.name + " Do Q"
	console.log(`Inserting image to page with content stream:\n${contentStream}`)

	// create drawing operations
	const extraContents = doc.addStream(contentStream, null)

	// add drawing operations to page contents
	const pageContents = pageObj.get("Contents")
	if (pageContents.isNull()) {
		pageObj.put("Contents", extraContents)
	} else if (pageContents.isArray()) {
		// Contents is already an array, so append our new buffer object.
		pageContents.push(extraContents)
	} else {
		// Contents is not an array, so change it into an array
		// and then append our new buffer object.
		const newPageContents = doc.newArray()
		newPageContents.push(pageContents)
		newPageContents.push(extraContents)
		pageObj.put("Contents", newPageContents)
	}
}
Extracting Document Images and Text
examples/tasks/page-words.ts
import * as mupdf from "mupdf"
/** One word found on a page, as collected by `getPageWords`. */
export type MyWord = {
	/** Rectangle accumulated from the quads of the word's characters. */
	rect: mupdf.Rect
	/** The characters of the word, concatenated. */
	text: string
	/** Font reported for the last character added to the word. */
	font: mupdf.Font
	/** Size reported for the last character added to the word. */
	size: number
}
/**
 * Collect the words on a page together with their rectangle, font and size.
 *
 * The page's structured text is walked character by character. A word ends
 * at a space character, at the end of a line, or at the end of a text block.
 * The space itself is not part of any word: it contributes neither text nor
 * area, so a word's rectangle covers only the word's own characters.
 *
 * @param page - the page to read
 * @returns the words in the order they were encountered
 */
export function getPageWords(page: mupdf.PDFPage): MyWord[] {
	const words: MyWord[] = []
	let cwordRect: mupdf.Rect | undefined
	let cwordFont: mupdf.Font | undefined
	let cwordSize: number | undefined
	let cwordText = ""

	const endWord = () => {
		// if word is complete, append to list
		if (cwordRect !== undefined && cwordFont !== undefined && cwordSize !== undefined && cwordText !== "") {
			words.push({
				rect: cwordRect,
				text: cwordText,
				font: cwordFont,
				size: cwordSize,
			})
		}
		// Reset values
		cwordRect = undefined
		cwordFont = undefined
		cwordSize = undefined
		cwordText = ""
	}

	// Grow the current word's rectangle using two corners of the quad:
	// elements 0/1 and elements 6/7.
	const enlargeRect = (quad: mupdf.Quad) => {
		if (cwordRect === undefined) {
			cwordRect = [ quad[0], quad[1], quad[6], quad[7] ]
			return
		}
		cwordRect[0] = Math.min(cwordRect[0], quad[0])
		cwordRect[1] = Math.min(cwordRect[1], quad[1])
		cwordRect[2] = Math.max(cwordRect[2], quad[6])
		cwordRect[3] = Math.max(cwordRect[3], quad[7])
	}

	// extract the words from the page
	page.toStructuredText("preserve-whitespace,preserve-spans").walk({
		onChar(c, _origin, font, size, quad) {
			// split by whitespace; close the word BEFORE looking at the space's
			// quad so the separator does not inflate the word's rectangle
			if (c == " ") {
				endWord()
				return
			}
			enlargeRect(quad)
			cwordFont = font
			cwordSize = size
			cwordText += c
		},
		// split by line and by block
		endLine: endWord,
		endTextBlock: endWord,
	})

	return words
}
/**
 * Collect every image block reported while walking a page's structured text.
 *
 * @param page - the page to read
 * @returns one entry per `onImageBlock` callback, in callback order, holding
 *   the bounding box, matrix and image that were passed to it
 */
export function getPageImages(page: mupdf.PDFPage): { bbox: mupdf.Rect; matrix: mupdf.Matrix; image: mupdf.Image }[] {
	const found: { bbox: mupdf.Rect; matrix: mupdf.Matrix; image: mupdf.Image }[] = []
	const structuredText = page.toStructuredText("preserve-images")
	structuredText.walk({
		onImageBlock: (bbox, matrix, image) => {
			found.push({ bbox, matrix, image })
		},
	})
	return found
}
Managing Page Resources & XObjects
examples/tasks/page-resources-xobject.ts
import * as mupdf from "mupdf"
/**
 * List the entries of a page's Resources/XObject dictionary.
 *
 * @param page - the page to inspect
 * @returns one `{ key, value }` pair per XObject entry, where `value` is the
 *   entry's `toString()` text; an empty array when the page object has no
 *   Resources dictionary or the Resources have no XObject dictionary
 */
export function getPageResourcesXObjects(page: mupdf.PDFPage): { key: string | number; value: string }[] {
	let pageObj = page.getObject()
	if (pageObj.isIndirect()) {
		pageObj = pageObj.resolve()
	}

	const entries: { key: string | number; value: string }[] = []

	// A page without resources (or without XObjects) simply has nothing to list.
	const res = pageObj.get("Resources")
	if (!res.isDictionary()) {
		return entries
	}
	const resXObj = res.get("XObject")
	if (!resXObj.isDictionary()) {
		return entries
	}

	resXObj.forEach(function (value: mupdf.PDFObject, key: string | number) {
		entries.push({ key: key, value: value.toString() })
	})
	return entries
}
/**
 * "Delete" an XObject from a page by swapping it for a 1x1 transparent image.
 *
 * The entry named `ref` in the page's Resources/XObject dictionary is
 * replaced rather than removed, so content streams that still reference the
 * name keep working but draw nothing visible.
 *
 * Does nothing when the page object has no Resources dictionary or the
 * Resources have no XObject dictionary.
 *
 * @param doc - the document that owns `page`
 * @param page - the page whose resource is replaced
 * @param ref - the XObject resource name to replace
 */
export function deletePageResourcesXObject(doc: mupdf.PDFDocument, page: mupdf.PDFPage, ref: string) {
	let pageObj = page.getObject()
	if (pageObj.isIndirect()) {
		pageObj = pageObj.resolve()
	}

	const res = pageObj.get("Resources")
	if (!res.isDictionary()) {
		return
	}
	const resXObj = res.get("XObject")
	if (!resXObj.isDictionary()) {
		return
	}

	// Build the 1x1 placeholder. The pixmap is cleared explicitly so that the
	// single pixel (including its alpha) is zeroed instead of being left with
	// whatever the allocation happened to contain.
	const pix = new mupdf.Pixmap(mupdf.ColorSpace.DeviceRGB, [ 0, 0, 1, 1 ], true)
	pix.clear()
	const imageRes = new mupdf.Image(pix)
	const image = doc.addImage(imageRes)

	// The dictionary is modified in place, as the other helpers in these
	// examples do, so nothing needs to be written back to the page.
	resXObj.put(ref, image)
}