Deprecated Modules#

The `mupdf/tasks` and `mupdf/mupdfjs` sub-modules were deprecated and have since been removed; they are no longer available.

The functions that the tasks and mupdfjs modules provided are now available as simple examples that you can copy into your own projects if you need them.

From `andytango/mupdf-js`#

To help you migrate from the https://github.com/andytango/mupdf-js library, you can copy the tasks.ts file into your project.

examples/tasks/tasks.ts#

import * as mupdf from "mupdf"

/**
 * Open a PDF that is already held in memory.
 *
 * @param data - Raw bytes of the PDF file.
 * @returns The `mupdf.PDFDocument` constructed from `data`.
 */
export function loadPDF(data: Buffer | ArrayBuffer | Uint8Array) {
    const pdf = new mupdf.PDFDocument(data)
    return pdf
}

/**
 * Render one page of a document to PNG at the requested resolution.
 *
 * @param document - Document that contains the page.
 * @param pageNumber - Page index handed to `loadPage`.
 * @param dpi - Output resolution; the page is scaled by `dpi / 72`.
 * @returns The PNG-encoded image bytes.
 */
export function drawPageAsPNG(document: mupdf.PDFDocument, pageNumber: number, dpi: number): Uint8Array {
    const page = document.loadPage(pageNumber)
    const scale = dpi / 72
    // Uniform scaling matrix: [a, b, c, d, e, f] with no rotation or translation.
    const pixmap = page.toPixmap([scale, 0, 0, scale, 0, 0], mupdf.ColorSpace.DeviceRGB)
    return pixmap.asPNG()
}

/**
 * Convert the structured text of one page to HTML.
 *
 * @param document - Document that contains the page.
 * @param pageNumber - Page index handed to `loadPage`.
 * @param id - Identifier forwarded unchanged to `asHTML`.
 * @returns The HTML produced by the page's structured text.
 */
export function drawPageAsHTML(document: mupdf.PDFDocument, pageNumber: number, id: number): string {
    const page = document.loadPage(pageNumber)
    const structuredText = page.toStructuredText()
    return structuredText.asHTML(id)
}

/**
 * Render one page of a document as SVG markup.
 *
 * The page is run through a device obtained from an "svg" `DocumentWriter`
 * that writes into an in-memory buffer; the buffer's text is returned.
 *
 * @param document - Document that contains the page.
 * @param pageNumber - Page index handed to `loadPage`.
 * @returns The contents of the writer's output buffer as a string.
 */
export function drawPageAsSVG(document: mupdf.PDFDocument, pageNumber: number): string {
    const page = document.loadPage(pageNumber)
    const output = new mupdf.Buffer()
    const svgWriter = new mupdf.DocumentWriter(output, "svg", "")

    const bounds = page.getBounds()
    const svgDevice = svgWriter.beginPage(bounds)
    page.run(svgDevice, mupdf.Matrix.identity)
    svgDevice.close()
    svgWriter.endPage()

    return output.asString()
}

/**
 * Extract the plain text of one page.
 *
 * @param document - Document that contains the page.
 * @param pageNumber - Page index handed to `loadPage`.
 * @returns The text produced by the page's structured text.
 */
export function getPageText(document: mupdf.PDFDocument, pageNumber: number): string {
    const page = document.loadPage(pageNumber)
    const structuredText = page.toStructuredText()
    return structuredText.asText()
}

/**
 * Search the structured text of one page.
 *
 * @param document - Document that contains the page.
 * @param pageNumber - Page index handed to `loadPage`.
 * @param searchString - Text to look for.
 * @param maxHits - Upper bound forwarded to `search`; defaults to 500.
 * @returns Whatever the structured text's `search` call returns.
 */
export function searchPageText(document: mupdf.PDFDocument, pageNumber: number, searchString: string, maxHits = 500) {
    const structuredText = document.loadPage(pageNumber).toStructuredText()
    return structuredText.search(searchString, maxHits)
}

From `mupdfjs`#

Splitting a Document#

examples/tasks/pdf-split.ts#

import * as mupdf from "mupdf"

/**
 * Split `document` into several new PDF documents.
 *
 * Without a range (undefined or empty) every page becomes its own
 * single-page document. Otherwise each entry of `range` is the first page
 * index of a part; the part runs up to (but not including) the next entry,
 * and the last part runs to the end of the document.
 *
 * @param document - Source document; pages are grafted out of it.
 * @param range - Starting page indexes of the parts, or undefined/empty.
 * @returns The newly created documents, in order.
 * @throws Error if any entry of `range` is negative.
 */
export function split(document: mupdf.PDFDocument, range: number[] | undefined) {
	const parts: mupdf.PDFDocument[] = []

	// No range given: emit every page as its own single-page PDF.
	if (range == undefined || range.length == 0) {
		let pageIndex = 0
		while (pageIndex < document.countPages()) {
			const single: mupdf.PDFDocument = new mupdf.PDFDocument()
			single.graftPage(0, document, pageIndex)
			parts.push(single)
			++pageIndex
		}
		return parts
	}

	// First pass: expand every range entry into the explicit list of source
	// page indexes it covers. All entries are validated before any document
	// is created.
	const pageSets: number[][] = []
	for (const [position, first] of range.entries()) {
		if (first < 0)
			throw new Error("Split error: document page indexes cannot be less than zero")
		const isLastEntry = position + 1 > range.length - 1
		const stop = isLastEntry ? document.countPages() : (range[position + 1] as number)
		const pages: number[] = []
		for (let p = first; p < stop; ++p)
			pages.push(p)
		pageSets.push(pages)
	}

	// Second pass: build one document per page set.
	for (const pages of pageSets) {
		const part = new mupdf.PDFDocument()
		const graftMap = part.newGraftMap()
		// The position within the set is the destination index for graftPage().
		pages.forEach((sourceIndex, targetIndex) => {
			graftMap.graftPage(targetIndex, document, sourceIndex)
		})
		parts.push(part)
	}
	return parts
}

Merging Documents#

examples/tasks/pdf-merge.ts#

import * as mupdf from "mupdf"

/**
 * Copy one page from `srcDoc` into `dstDoc`.
 *
 * A fresh page dictionary is created in `dstDoc`, and the source page's
 * `MediaBox`, `Rotate`, `Resources` and `Contents` entries are grafted into
 * it through `dstFromSrc`. Only entries stored directly on the source page
 * dictionary are looked at.
 *
 * @param dstDoc - Document receiving the page.
 * @param srcDoc - Document the page is read from.
 * @param pageNumber - Index of the page in `srcDoc`, passed to `findPage`.
 * @param insertAt - Position passed to `dstDoc.insertPage`.
 * @param dstFromSrc - Graft map used to copy objects between the documents.
 */
function copyPage(dstDoc, srcDoc, pageNumber, insertAt, dstFromSrc) {
	const srcPage = srcDoc.findPage(pageNumber)
	const dstPage = dstDoc.newDictionary()
	dstPage.put("Type", dstDoc.newName("Page"))

	for (const key of ["MediaBox", "Rotate", "Resources", "Contents"]) {
		const value = srcPage.get(key)
		// Elsewhere in this file a missing entry is detected with isNull() on
		// the object returned by get(), so truthiness alone is not a reliable
		// presence test: skip null objects instead of grafting them.
		if (value && !value.isNull())
			dstPage.put(key, dstFromSrc.graftObject(value))
	}

	dstDoc.insertPage(insertAt, dstDoc.addObject(dstPage))
}

/**
 * Copy a run of pages from `sourcePDF` into `targetPDF`.
 *
 * @param targetPDF - Document that receives the pages.
 * @param sourcePDF - Document the pages are copied from.
 * @param fromPage - First source page index; clamped into the source's page range.
 * @param toPage - Last source page index (inclusive); a negative value means
 *   the last source page, larger values are clamped to it.
 * @param startAt - Insertion index in the target; a negative value (or one
 *   past the end) means "after the current last page".
 */
export function merge(targetPDF, sourcePDF, fromPage = 0, toPage = -1, startAt = -1) {
	const lastSourcePage = sourcePDF.countPages() - 1
	const targetLength = targetPDF.countPages()
	// One graft map for the whole run, passed to every copyPage() call.
	const dstFromSrc = targetPDF.newGraftMap()

	const first = Math.max(0, Math.min(fromPage, lastSourcePage))
	const last = toPage < 0 ? lastSourcePage : Math.min(toPage, lastSourcePage)
	const insertBase = startAt < 0 ? targetLength : Math.min(startAt, targetLength)

	let offset = 0
	for (let pageNumber = first; pageNumber <= last; pageNumber++) {
		copyPage(targetPDF, sourcePDF, pageNumber, insertBase + offset, dstFromSrc)
		offset++
	}
}

Scrubbing a Document#

examples/tasks/pdf-scrub.ts#

import * as mupdf from "mupdf"

/**
 * Remove potentially sensitive content from a PDF document, in place.
 *
 * Every option defaults to enabled; pass `false` to skip a step.
 *
 * - `metadata`: blanks the standard `info:*` metadata fields.
 * - `cleanPages`: saves the document with `clean=yes`, reopens the result and
 *   grafts each cleaned page's `Contents` back onto the matching page.
 * - `removeLinks`: deletes every link on every page.
 * - `attachedFiles`: replaces the file spec of FileAttachment annotations with null.
 * - `resetFields`: resets each widget's `V` to its `DV` (or removes `V`).
 * - `resetResponses`: removes `RT` and `IRT` from every annotation.
 * - `thumbnails`: removes the `Thumb` entry from every page.
 * - `embeddedFiles`: empties the `Names/EmbeddedFiles` name tree in the catalog.
 * - `javascript`: replaces every dictionary whose `S` is `JavaScript` with an
 *   action carrying an empty script.
 * - `xmlMetadata`: removes `Metadata` from the catalog.
 * - `hiddenText`, `redactions`, `redactImages`: accepted but not implemented yet.
 *
 * @param document - Document to modify.
 * @param options - Which steps to run; may be omitted to run all of them.
 * @throws Error "is not PDF" if `document.isPDF()` is false.
 * @throws Error "encrypted doc" if `document.needsPassword()` is true.
 */
export function scrub(
	document: mupdf.PDFDocument,
	options: {
		attachedFiles?: boolean,
		cleanPages?: boolean,
		embeddedFiles?: boolean,
		hiddenText?: boolean,
		javascript?: boolean,
		metadata?: boolean,
		redactions?: boolean,
		redactImages?: number,
		removeLinks?: boolean,
		resetFields?: boolean,
		resetResponses?: boolean,
		thumbnails?: boolean,
		xmlMetadata?: boolean,
	} = {}
): void {
	// hiddenText, redactions and redactImages are part of the options type but
	// have no implementation yet, so they are intentionally not read here.
	const {
		attachedFiles = true,
		cleanPages = true,
		embeddedFiles = true,
		javascript = true,
		metadata = true,
		removeLinks = true,
		resetFields = true,
		resetResponses = true,
		thumbnails = true,
		xmlMetadata = true,
	} = options

	// Basic validation
	if (!document.isPDF()) {
		throw new Error("is not PDF")
	}

	if (document.needsPassword()) {
		throw new Error("encrypted doc")
	}

	// Metadata cleaning: clear all standard PDF metadata fields
	if (metadata) {
		const infoKeys = [
			"info:Title",
			"info:Author",
			"info:Subject",
			"info:Keywords",
			"info:Creator",
			"info:Producer",
			"info:CreationDate",
			"info:ModDate",
		]
		for (const key of infoKeys) {
			document.setMetaData(key, "")
		}
	}

	const pageCount = document.countPages()

	// Clean page contents. This is a whole-document operation (it already
	// visits every page), so it runs once rather than once per page.
	if (cleanPages && pageCount > 0) {
		const cleanBuffer = document.saveToBuffer("clean=yes")
		const cleanDoc = mupdf.PDFDocument.openDocument(cleanBuffer, "application/pdf") as mupdf.PDFDocument
		const cleanPageCount = cleanDoc.countPages()
		for (let j = 0; j < cleanPageCount; j++) {
			const cleanContents = cleanDoc.loadPage(j).getObject().get("Contents")
			const thisPageObj = document.loadPage(j).getObject()
			thisPageObj.put("Contents", document.graftObject(cleanContents))
		}
	}

	// Per-page work: each page is loaded once and shared by all enabled steps.
	const touchesPages = removeLinks || attachedFiles || resetFields || resetResponses || thumbnails
	for (let i = 0; touchesPages && i < pageCount; i++) {
		const page = document.loadPage(i)

		// Remove links
		if (removeLinks) {
			for (const link of page.getLinks()) {
				page.deleteLink(link)
			}
		}

		// Handle attached files
		if (attachedFiles) {
			for (const annot of page.getAnnotations()) {
				if (annot.getType() === "FileAttachment") {
					annot.setFileSpec(document.newNull())
				}
			}
		}

		// Reset form fields to their default value
		if (resetFields) {
			for (const widget of page.getWidgets()) {
				const widgetObj = widget.getObject()
				const defaultValue = widgetObj.get("DV")
				if (defaultValue.isNull()) {
					widgetObj.delete("V")
				} else {
					widgetObj.put("V", defaultValue)
				}
				// Checkboxes and radio buttons also carry an appearance state
				// that has to follow the value.
				const widgetType = widget.getFieldType()
				if (widgetType === "checkbox" || widgetType === "radiobutton") {
					widgetObj.put("AS", defaultValue.isNull() ? document.newName("Off") : defaultValue)
				}
				widget.update()
			}
		}

		// Reset responses: drop the reply type and in-reply-to reference
		if (resetResponses) {
			for (const annot of page.getAnnotations()) {
				const annotObj = annot.getObject()
				annotObj.delete("RT")
				annotObj.delete("IRT")
				annot.update()
			}
		}

		// Remove thumbnails
		if (thumbnails) {
			page.getObject().delete("Thumb")
		}
	}

	// Handle embedded files
	if (embeddedFiles) {
		const root = document.getTrailer().get("Root")
		const names = root.get("Names")
		if (!names.isNull() && names.isDictionary()) {
			const embeddedFilesDict = names.get("EmbeddedFiles")
			if (!embeddedFilesDict.isNull() && embeddedFilesDict.isDictionary()) {
				embeddedFilesDict.put("Names", document.newArray())
				// A name tree root may list its entries through Kids instead
				// of Names; drop that branch too so no attachment stays reachable.
				embeddedFilesDict.delete("Kids")
			}
		}
	}

	// Handle JavaScript: overwrite every JavaScript action with an empty script
	if (javascript) {
		const xrefLength = document.countObjects()
		for (let xref = 1; xref < xrefLength; xref++) {
			const obj = document.newIndirect(xref)
			const resolvedObj = obj.resolve()
			if (resolvedObj.isDictionary()) {
				const type = resolvedObj.get("S")
				if (!type.isNull() && type.asName() === "JavaScript") {
					const newObj = document.newDictionary()
					newObj.put("S", document.newName("JavaScript"))
					newObj.put("JS", document.newString(""))
					obj.writeObject(newObj)
				}
			}
		}
	}

	// Handle XML metadata
	if (xmlMetadata) {
		const root = document.getTrailer().get("Root")
		root.delete("Metadata")
	}
}

Adding Text to Pages#

examples/tasks/page-insert-text.ts#

import * as mupdf from "mupdf"

/**
 * Append a line of text to a page by adding a new content stream.
 *
 * The font is registered in the page's `Resources/Font` dictionary under the
 * first unused `F<n>` name, so fonts already present on the page (including
 * those added by earlier calls) are left untouched. Opacity is applied
 * through an entry added to `Resources/ExtGState`. The generated content
 * stream is also written to the console.
 *
 * The caller's `point` and `graphics` arguments are not modified.
 *
 * @param doc - Document that owns `page`.
 * @param page - Page to draw on.
 * @param value - Text to draw. Backslashes, parentheses and line breaks are
 *   escaped so the text cannot break out of the PDF string literal.
 * @param point - Position of the text measured from the top-left of the page;
 *   the Y coordinate is flipped using `page.getBounds()[3]` and `fontSize`.
 * @param fontName - Name passed to `new mupdf.Font`.
 * @param fontSize - Font size used for the `Tf` operator.
 * @param graphics - Stroke/fill colours as `[r, g, b, alpha]` and the stroke
 *   thickness. Missing members fall back to opaque black and thickness 1; a
 *   thickness of 0 disables stroking.
 */
export function insertText(
	doc: mupdf.PDFDocument,
	page: mupdf.PDFPage,
	value: string,
	point: mupdf.Point,
	fontName: string = "Times-Roman",
	fontSize: number = 18,
	graphics: {
		strokeColor: mupdf.Color,
		fillColor: mupdf.Color,
		strokeThickness: number
	} = { strokeColor: [0, 0, 0, 1], fillColor: [0, 0, 0, 1], strokeThickness: 1 }
) {
	const pageObj = page.getObject()
	const font = new mupdf.Font(fontName)
	const fontResource = doc.addSimpleFont(font)

	// Locate page/Resources/Font, creating the nested dictionaries as needed.
	let resources = pageObj.get("Resources")
	if (!resources.isDictionary()) {
		resources = doc.newDictionary()
		pageObj.put("Resources", resources)
	}

	let fontDict = resources.get("Font")
	if (!fontDict.isDictionary()) {
		fontDict = doc.newDictionary()
		resources.put("Font", fontDict)
	}

	// Register the font under a name nobody else uses: overwriting an existing
	// entry would silently change the font of text already on the page.
	let fontKey = "F1"
	for (let n = 2; !fontDict.get(fontKey).isNull(); n++) {
		fontKey = "F" + n
	}
	fontDict.put(fontKey, fontResource)

	// Callers may pass only part of the "graphics" object. Fill in the gaps on
	// local copies so the caller's object and colour arrays stay untouched.
	const stroke: number[] = graphics.strokeColor == undefined ? [0, 0, 0, 1] : Array.from(graphics.strokeColor)
	const fill: number[] = graphics.fillColor == undefined ? [0, 0, 0, 1] : Array.from(graphics.fillColor)
	const strokeThickness = graphics.strokeThickness == undefined ? 1 : graphics.strokeThickness
	const rawStrokeAlpha = stroke[3]
	const rawFillAlpha = fill[3]
	const strokeAlpha = rawStrokeAlpha == undefined ? 1 : rawStrokeAlpha
	const fillAlpha = rawFillAlpha == undefined ? 1 : rawFillAlpha

	// Format the colours for the content stream.
	const strokeColor = stroke[0] + " " + stroke[1] + " " + stroke[2] + " RG"
	const fillColor = fill[0] + " " + fill[1] + " " + fill[2] + " rg"
	const strokeOpacity = (strokeAlpha * 100).toString()
	const fillOpacity = (fillAlpha * 100).toString()

	// Text render mode 2 fills and then strokes; with no thickness only fill.
	let strokeThicknessMarkup = "2 Tr " + strokeThickness + " w"
	if (strokeThickness == 0) {
		strokeThicknessMarkup = ""
	}

	// Add the graphics state object to the resources dictionary.
	let extGState = resources.get("ExtGState")
	if (!extGState.isDictionary()) {
		extGState = doc.newDictionary()
		resources.put("ExtGState", extGState)
	}

	const graphicsDict = doc.newDictionary()
	graphicsDict.put("CA", strokeAlpha)
	graphicsDict.put("ca", fillAlpha)

	// The separator keeps different opacity pairs from producing the same
	// name (e.g. 1/10 versus 11/0 without it).
	const graphicsStateIdentifier = "fitzca" + strokeOpacity + "_" + fillOpacity
	extGState.put(graphicsStateIdentifier, graphicsDict)

	const graphicsState = "/" + graphicsStateIdentifier + " gs"

	// Invert the Y coordinate (on a local, not on the caller's point).
	const x = point[0]
	const y = page.getBounds()[3] - (point[1] + fontSize)

	// Escape the characters that are special inside a PDF literal string.
	const escapedValue = value
		.replace(/[\\()]/g, "\\$&")
		.replace(/\r/g, "\\r")
		.replace(/\n/g, "\\n")

	const contentStream: string = "q " + graphicsState + " BT " + strokeColor + " " + fillColor + " " + strokeThicknessMarkup + " /" + fontKey + " " + fontSize + " Tf " + x + " " + y + " Td (" + escapedValue + ") Tj ET Q"
	console.log(`Inserting text to page with content stream:\n${contentStream}`)

	// Create drawing operations
	const extraContents = doc.addStream(contentStream, {})

	// Add drawing operations to page contents
	const pageContents = pageObj.get("Contents")

	if (pageContents.isNull()) {
		pageObj.put("Contents", extraContents)
	} else if (pageContents.isArray()) {
		// Contents is already an array, so append our new stream.
		pageContents.push(extraContents)
	} else {
		// Contents is a single stream: wrap it in an array and append ours.
		const newPageContents = doc.newArray()
		newPageContents.push(pageContents)
		newPageContents.push(extraContents)
		pageObj.put("Contents", newPageContents)
	}
}

Adding Images to Pages#

examples/tasks/page-insert-image.ts#

import * as mupdf from "mupdf"

/**
 * Draw an image on a page by adding a new content stream.
 *
 * The image is registered in the page's `Resources/XObject` dictionary under
 * `data.name` (replacing any entry of that name) and painted with a `Do`
 * operator. `data.name` is written verbatim into the content stream, so it
 * must be usable as a PDF name. The generated content stream is also written
 * to the console.
 *
 * The caller's `metrics` object is not modified, so the same object can be
 * reused for several calls.
 *
 * @param doc - Document that owns `page`.
 * @param page - Page to draw on.
 * @param data - The image and the resource name to register it under.
 * @param metrics - Placement measured from the top-left of the page. A missing
 *   or zero `width`/`height` falls back to the image's own size; a missing
 *   `x`/`y` falls back to 0.
 * @throws Error "Invalid image" if `data.image` is null or undefined.
 * @throws Error "Invalid name" if `data.name` is null, undefined or empty.
 */
export function insertImage(
	doc: mupdf.PDFDocument,
	page: mupdf.PDFPage,
	data: { image: mupdf.Image, name: string },
	metrics: { x?: number, y?: number, width?: number, height?: number } = { x: 0, y: 0, width: 0, height: 0 }
) {
	if (data.image == null) {
		throw new Error("Invalid image");
	}

	if (data.name == null || data.name.length == 0) {
		throw new Error("Invalid name");
	}

	const pageObj = page.getObject()

	// Locate page/Resources/XObject, creating the nested dictionaries as needed.
	let resources = pageObj.get("Resources")
	if (!resources.isDictionary()) {
		resources = doc.newDictionary()
		pageObj.put("Resources", resources)
	}

	let xobjects = resources.get("XObject")
	if (!xobjects.isDictionary()) {
		xobjects = doc.newDictionary()
		resources.put("XObject", xobjects)
	}

	const image = doc.addImage(data.image)

	// Resolve the placement into locals, using sensible defaults for anything
	// that was not provided, instead of writing back into the caller's object.
	const width = metrics.width == 0 || metrics.width == undefined ? data.image.getWidth() : metrics.width
	const height = metrics.height == 0 || metrics.height == undefined ? data.image.getHeight() : metrics.height
	const x = metrics.x == undefined ? 0 : metrics.x
	const top = metrics.y == undefined ? 0 : metrics.y

	// Invert the Y coordinate: the content stream is positioned from the
	// bottom of the page bounds.
	const y = page.getBounds()[3] - (top + height)

	xobjects.put(data.name, image)

	const contentStream: string = "q " + width + " 0 0 " + height + " " + x + " " + y + " cm /" + data.name + " Do Q"

	console.log(`Inserting image to page with content stream:\n${contentStream}`)

	// Create drawing operations
	const extraContents = doc.addStream(contentStream, null)

	// Add drawing operations to page contents
	const pageContents = pageObj.get("Contents")

	if (pageContents.isNull()) {
		pageObj.put("Contents", extraContents)
	} else if (pageContents.isArray()) {
		// Contents is already an array, so append our new stream.
		pageContents.push(extraContents)
	} else {
		// Contents is a single stream: wrap it in an array and append ours.
		const newPageContents = doc.newArray()
		newPageContents.push(pageContents)
		newPageContents.push(extraContents)
		pageObj.put("Contents", newPageContents)
	}
}

Extracting Document Images and Text#

examples/tasks/page-words.ts#

import * as mupdf from "mupdf"

/** A single word extracted from a page by `getPageWords`. */
export type MyWord = {
	/** Bounding rectangle accumulated from the character quads reported while the word was being built. */
	rect: mupdf.Rect,
	/** The word's characters; the space that terminates a word is not part of it. */
	text: string,
	/** Font reported by the structured-text walk for the word (the last value seen wins). */
	font: mupdf.Font,
	/** Font size reported alongside `font`. */
	size: number,
};

/**
 * Extract the words of a page together with their bounding rectangle, font
 * and font size.
 *
 * The page's structured text is walked character by character. A word ends at
 * a space character, at the end of a line, or at the end of a text block.
 * The separating space is not part of the word: it contributes neither to the
 * word's rectangle nor to its font or size.
 *
 * @param page - Page to read.
 * @returns The words in the order the walk reports them.
 */
export function getPageWords(page: mupdf.PDFPage): MyWord[] {
	const words: MyWord[] = []
	let cwordRect: mupdf.Rect | undefined
	let cwordFont: mupdf.Font | undefined
	let cwordSize: number | undefined
	let cwordText = ""

	// Store the word collected so far (if any) and start a new one.
	const endWord = () => {
		if (cwordRect !== undefined && cwordFont !== undefined && cwordSize !== undefined && cwordText !== "") {
			words.push({
				rect: cwordRect,
				text: cwordText,
				font: cwordFont,
				size: cwordSize,
			})
		}

		cwordRect = undefined
		cwordFont = undefined
		cwordSize = undefined
		cwordText = ""
	}

	// Grow the current word's rectangle so that it covers `quad`. All four
	// corners are considered, so the result is a well-formed rectangle even
	// when the quad's corners are not in upper-left / lower-right order.
	const enlargeRect = (quad: mupdf.Quad) => {
		const left = Math.min(quad[0], quad[2], quad[4], quad[6])
		const top = Math.min(quad[1], quad[3], quad[5], quad[7])
		const right = Math.max(quad[0], quad[2], quad[4], quad[6])
		const bottom = Math.max(quad[1], quad[3], quad[5], quad[7])

		if (cwordRect === undefined) {
			cwordRect = [ left, top, right, bottom ]
			return
		}

		cwordRect[0] = Math.min(cwordRect[0], left)
		cwordRect[1] = Math.min(cwordRect[1], top)
		cwordRect[2] = Math.max(cwordRect[2], right)
		cwordRect[3] = Math.max(cwordRect[3], bottom)
	}

	// extract the words from the page
	page.toStructuredText("preserve-whitespace,preserve-spans").walk({
		onChar(c, _origin, font, size, quad) {
			// A space only terminates the word; handling it first keeps its
			// quad, font and size out of the word it follows.
			if (c == " ") {
				endWord()
				return
			}

			enlargeRect(quad)
			cwordFont = font
			cwordSize = size
			cwordText += c
		},
		// split by line and by block
		endLine: endWord,
		endTextBlock: endWord,
	})

	return words
}

/**
 * Collect the image blocks found on a page.
 *
 * @param page - Page to read.
 * @returns One entry per image block reported by the structured-text walk,
 *   holding the bounding box, matrix and image passed to `onImageBlock`.
 */
export function getPageImages(page: mupdf.PDFPage): { bbox: mupdf.Rect; matrix: mupdf.Matrix; image: mupdf.Image }[] {
	const found: { bbox: mupdf.Rect; matrix: mupdf.Matrix; image: mupdf.Image }[] = []
	const structuredText = page.toStructuredText("preserve-images")
	structuredText.walk({
		onImageBlock: (bbox, matrix, image) => {
			found.push({ bbox, matrix, image })
		},
	})
	return found
}

Managing Page Resources & XObjects#

examples/tasks/page-resources-xobject.ts#

import * as mupdf from "mupdf"

/**
 * List the entries of a page's `Resources/XObject` dictionary.
 *
 * @param page - Page to inspect.
 * @returns One `{ key, value }` pair per entry, where `value` is the
 *   entry's `toString()` representation.
 */
export function getPageResourcesXObjects(page: mupdf.PDFPage): { key: string | number; value: string }[] {
	const rawPageObj = page.getObject()
	// Work on the resolved dictionary when the page object is a reference.
	const pageDict = rawPageObj.isIndirect() ? rawPageObj.resolve() : rawPageObj

	const xobjects = pageDict.get("Resources").get("XObject")
	const entries: { key: string | number; value: string }[] = []

	xobjects.forEach((value: mupdf.PDFObject, key: string | number) => {
		entries.push({ key, value: value.toString() })
	})

	return entries
}

/**
 * "Delete" an XObject from a page by replacing it with a 1x1 transparent image.
 *
 * The resource name stays defined, so content streams that still reference
 * it keep working, but nothing visible is drawn. If the page has no
 * `Resources/XObject` dictionary, or the dictionary has no entry called
 * `ref`, the page is left unchanged.
 *
 * @param doc - Document that owns `page`; the placeholder image is added to it.
 * @param page - Page whose XObject should be replaced.
 * @param ref - Name of the entry in the page's `Resources/XObject` dictionary.
 */
export function deletePageResourcesXObject(doc: mupdf.PDFDocument, page: mupdf.PDFPage, ref: string) {
	let pageObj = page.getObject()
	if (pageObj.isIndirect()) {
		pageObj = pageObj.resolve()
	}

	const xobjects = pageObj.get("Resources").get("XObject")

	// Nothing to delete: do not create a placeholder for a name that is not there.
	if (!xobjects.isDictionary() || xobjects.get(ref).isNull()) {
		return
	}

	// Build the 1x1 placeholder. A new pixmap's samples are not cleared on
	// construction, so clear it explicitly to make the pixel transparent.
	const pix = new mupdf.Pixmap(mupdf.ColorSpace.DeviceRGB, [ 0, 0, 1, 1 ], true)
	pix.clear()
	const placeholder = doc.addImage(new mupdf.Image(pix))

	// The dictionary is modified in place; it does not need to be re-attached
	// to the page's resources afterwards.
	xobjects.put(ref, placeholder)
}