Implemented trying to read PdfInvoiceData from PDF attachments

This commit is contained in:
dankito 2024-11-30 20:57:57 +01:00
parent 1746346ceb
commit 1024346b5f
12 changed files with 182 additions and 3 deletions

View File

@ -16,6 +16,9 @@ val kotlinCoroutinesVersion: String by project
val mustangVersion: String by project
val textInfoExtractor: String by project
val pdfboxTextExtractor: String by project
val angusMailVersion: String by project
val klfVersion: String by project
@ -30,6 +33,10 @@ dependencies {
implementation("org.mustangproject:library:$mustangVersion")
implementation("org.mustangproject:validator:$mustangVersion")
// pdf invoice data extraction
api("net.dankito.text.extraction:text-info-extractor:$textInfoExtractor")
api("net.dankito.text.extraction:pdfbox-text-extractor:$pdfboxTextExtractor")
implementation("org.eclipse.angus:angus-mail:$angusMailVersion")
implementation("net.codinux.log:klf:$klfVersion")

View File

@ -9,6 +9,8 @@ import kotlinx.coroutines.*
import net.codinux.invoicing.email.model.*
import net.codinux.invoicing.filesystem.FileUtil
import net.codinux.invoicing.model.Invoice
import net.codinux.invoicing.pdf.PdfInvoiceData
import net.codinux.invoicing.pdf.PdfInvoiceDataExtractor
import net.codinux.invoicing.reader.EInvoiceReader
import net.codinux.log.logger
import org.eclipse.angus.mail.imap.IMAPFolder
@ -21,6 +23,7 @@ import kotlin.math.max
open class EmailsFetcher(
protected open val eInvoiceReader: EInvoiceReader = EInvoiceReader(),
protected open val pdfInvoiceDataExtractor: PdfInvoiceDataExtractor = PdfInvoiceDataExtractor(),
protected open val coroutineDispatcher: CoroutineDispatcher = Executors.newFixedThreadPool(max(24, Runtime.getRuntime().availableProcessors() * 4)).asCoroutineDispatcher()
) {
@ -201,12 +204,14 @@ open class EmailsFetcher(
val (invoice, invoiceFile) = tryToReadEInvoice(part, extension, messagePart.mediaType, status)
val pdfInvoiceData: PdfInvoiceData? = tryToReadInvoiceDataFromPdf(extension, messagePart.mediaType, invoiceFile)
if (invoice != null || Part.ATTACHMENT.equals(part.disposition, ignoreCase = true)) {
val file = invoiceFile ?:
if (extension !in status.options.downloadAttachmentsWithExtensions) null
else downloadAttachment(part, status)
return EmailAttachment(part.fileName, extension, part.size.takeIf { it > 0 }, mapDisposition(part), messagePart.mediaType, part.contentType, invoice, file)
return EmailAttachment(part.fileName, extension, part.size.takeIf { it > 0 }, mapDisposition(part), messagePart.mediaType, part.contentType, invoice, pdfInvoiceData, file)
}
} catch (e: Throwable) {
log.error(e) { "Could not check attachment '${messagePart.part.fileName}' (${messagePart.mediaType}) for eInvoice" }
@ -243,6 +248,14 @@ open class EmailsFetcher(
}
}
/**
 * Tries to extract unstructured invoice data from an attachment that looks like a PDF.
 *
 * Returns `null` if there is no downloaded [invoiceFile] or if neither [extension] nor [mediaType]
 * point to a PDF. The generic "application/octet-stream" media type is treated as a possible PDF as well.
 * An error reported by the extractor is currently dropped; only the extracted data is returned.
 */
private fun tryToReadInvoiceDataFromPdf(extension: String, mediaType: String, invoiceFile: File?): PdfInvoiceData? {
    // if it's a PDF then the preceding eInvoice check should already have downloaded invoiceFile,
    // so it is expected to be non-null in that case — TODO confirm against tryToReadEInvoice()
    if (invoiceFile == null) {
        return null
    }

    val mayBePdf = extension == "pdf" || mediaType == "application/pdf" || mediaType == "application/octet-stream"
    if (!mayBePdf) {
        return null
    }

    // TODO: pass result.error to status.onError()
    return pdfInvoiceDataExtractor.tryToExtractInvoiceData(invoiceFile).data
}
protected open fun getAllMessageParts(part: Part): List<MessagePart> {
return if (part.isMimeType("multipart/*")) {

View File

@ -36,9 +36,11 @@ class Email(
val hasAttachments: Boolean by lazy { attachments.isNotEmpty() }
val hasPdfAttachment: Boolean by lazy { attachments.any { it.isPdfFile } }
val hasEInvoiceAttachment: Boolean by lazy { attachments.any { it.containsEInvoice } }
val hasPdfAttachment: Boolean by lazy { attachments.any { it.isPdfFile } }
val hasAttachmentsWithExtractedInvoiceData: Boolean by lazy { attachments.any { it.couldExtractPdfInvoiceData } }
override fun toString() = "${date.atZone(ZoneId.systemDefault()).toLocalDate()} $sender: $subject, ${attachments.size} attachment(s)"

View File

@ -1,6 +1,7 @@
package net.codinux.invoicing.email.model
import net.codinux.invoicing.model.Invoice
import net.codinux.invoicing.pdf.PdfInvoiceData
import java.io.File
class EmailAttachment(
@ -16,11 +17,14 @@ class EmailAttachment(
val mediaType: String?,
val contentType: String?,
val invoice: Invoice? = null,
val pdfInvoiceData: PdfInvoiceData? = null,
val file: File? = null
) {
val isPdfFile: Boolean by lazy { extension == "pdf" || mediaType == "application/pdf" }
val containsEInvoice: Boolean by lazy { invoice != null }
val isPdfFile: Boolean by lazy { extension == "pdf" || mediaType == "application/pdf" }
val couldExtractPdfInvoiceData: Boolean by lazy { pdfInvoiceData != null }
override fun toString() = "$filename: $invoice"
}

View File

@ -0,0 +1,11 @@
package net.codinux.invoicing.pdf
import java.math.BigDecimal
/**
 * A monetary amount together with its currency.
 *
 * [amountWithCurrency] is the textual form returned by [toString]. Unless it is passed explicitly,
 * it is composed of [amount] and [currency] separated by a single space.
 */
class AmountOfMoney(
    val amount: BigDecimal,
    val currency: String,
    val amountWithCurrency: String = "$amount $currency",
) {

    override fun toString(): String = amountWithCurrency

}

View File

@ -0,0 +1,17 @@
package net.codinux.invoicing.pdf
import net.dankito.text.extraction.ITextExtractor
import net.dankito.text.extraction.pdf.PdfBoxPdfTextExtractor
import java.io.File
/**
 * [PdfTextExtractor] implementation that delegates the actual work to an [ITextExtractor],
 * by default the PDFBox based extractor of the text extraction library.
 */
open class PdfBoxPdfTextExtractor(
    // fully qualified because the library class has the same simple name as this class
    protected open val textExtractor: ITextExtractor = net.dankito.text.extraction.pdf.PdfBoxPdfTextExtractor(),
) : PdfTextExtractor {

    /**
     * Extracts the text of [pdfFile] and converts the library's result to a [PdfTextExtractorResult],
     * taking over the extracted text and, if an error is reported, its exception.
     */
    override fun extractTextFromPdf(pdfFile: File): PdfTextExtractorResult =
        textExtractor.extractText(pdfFile).let { extraction ->
            PdfTextExtractorResult(extraction.text, extraction.error?.exception)
        }

}

View File

@ -0,0 +1,33 @@
package net.codinux.invoicing.pdf
import java.math.BigDecimal
import java.time.LocalDate
/**
 * Invoice data guessed from the unstructured text of a PDF.
 *
 * PDFs contain only unstructured data, so it is much harder to get invoice data from a PDF than from
 * a structured XML eInvoice file. We can only guess which value is the total amount, which the net
 * and VAT amount, which the IBAN, ...
 *
 * Therefore the properties holding such a guess carry 'potential' in their name. The `found...` lists
 * presumably hold all candidates of the respective kind that were found in the text, and [pdfText] is
 * the text the data was extracted from.
 */
class PdfInvoiceData(
    val potentialTotalAmount: AmountOfMoney,
    val potentialNetAmount: AmountOfMoney? = null,
    val potentialValueAddedTax: AmountOfMoney? = null,
    val potentialValueAddedTaxRate: BigDecimal? = null,

    val potentialIban: String? = null,
    val potentialBic: String? = null,

    val foundAmounts: List<AmountOfMoney> = emptyList(),
    val foundPercentages: List<AmountOfMoney> = emptyList(),
    val foundDates: List<LocalDate> = emptyList(),
    val foundPotentialIbans: List<String> = emptyList(),
    val foundPotentialBics: List<String> = emptyList(),

    val pdfText: String
) {

    /** The textual form of this data is that of its [potentialTotalAmount]. */
    override fun toString(): String = potentialTotalAmount.toString()

}

View File

@ -0,0 +1,10 @@
package net.codinux.invoicing.pdf
/**
 * Outcome of trying to extract [PdfInvoiceData] from a PDF.
 *
 * A non-null [data] is treated as success by [toString]; otherwise [error] is reported.
 */
class PdfInvoiceDataExtractionResult(
    val error: Throwable?,
    val data: PdfInvoiceData?,
) {

    override fun toString(): String = when (data) {
        null -> "Error: $error"
        else -> "Success: $data"
    }

}

View File

@ -0,0 +1,58 @@
package net.codinux.invoicing.pdf
import net.dankito.text.extraction.info.invoice.InvoiceDataExtractor
import net.dankito.text.extraction.info.model.InvoiceData
import java.io.File
import java.time.LocalDate
/**
 * PDFs contain only unstructured data, so it's way harder to get invoice data from PDFs than from structured XML eInvoice files.
 *
 * But for validation purposes or PDFs without attached eInvoice XML we also try to extract unstructured invoice data from PDFs.
 */
open class PdfInvoiceDataExtractor(
    protected open val textExtractor: PdfTextExtractor = PdfBoxPdfTextExtractor(),
    protected open val invoiceDataExtractor: InvoiceDataExtractor = InvoiceDataExtractor()
) {

    /**
     * Extracts the text of the PDF [file] and tries to find invoice data in it.
     *
     * Failures are reported via [PdfInvoiceDataExtractionResult.error] instead of being thrown:
     * - the text extraction reported an error or returned no text,
     * - the invoice data extraction reported an error,
     * - no total amount could be found in the text,
     * - any step threw an exception.
     *
     * @return a result that holds either the extracted data or the error that prevented the extraction
     */
    open fun tryToExtractInvoiceData(file: File): PdfInvoiceDataExtractionResult {
        try {
            val textExtractionResult = extractTextFromPdf(file)
            val pdfText = textExtractionResult.text

            if (textExtractionResult.error != null || pdfText == null) {
                // never return a result that contains neither data nor an error
                val error = textExtractionResult.error
                    ?: IllegalStateException("Could not extract text from PDF $file")
                return PdfInvoiceDataExtractionResult(error, null)
            }

            val result = invoiceDataExtractor.extractInvoiceData(pdfText)

            return when {
                result.error != null ->
                    PdfInvoiceDataExtractionResult(result.error, null)
                result.potentialTotalAmount == null ->
                    PdfInvoiceDataExtractionResult(IllegalStateException("Could not find total amount of invoice in PDF $file"), null)
                else ->
                    PdfInvoiceDataExtractionResult(null, mapInvoiceData(result, pdfText))
            }
        } catch (e: Exception) {
            // a "try to" method reports failures via the result instead of throwing, so that callers that
            // only read the data do not lose whatever else they were processing
            return PdfInvoiceDataExtractionResult(e, null)
        }
    }

    /** Extracts the text of [file] with the configured [textExtractor]. */
    protected open fun extractTextFromPdf(file: File): PdfTextExtractorResult =
        textExtractor.extractTextFromPdf(file)

    /**
     * Maps the library's [InvoiceData] to [PdfInvoiceData].
     *
     * @throws IllegalStateException if [result] contains no total amount
     */
    protected open fun mapInvoiceData(result: InvoiceData, pdfText: String) = PdfInvoiceData(
        potentialTotalAmount = checkNotNull(mapAmount(result.potentialTotalAmount)) {
            "InvoiceData must contain a total amount to be mapped to PdfInvoiceData"
        },
        potentialNetAmount = mapAmount(result.potentialNetAmount),
        potentialValueAddedTax = mapAmount(result.potentialValueAddedTax),
        potentialValueAddedTaxRate = result.potentialValueAddedTaxRate?.amount,

        potentialIban = result.potentialIban,
        potentialBic = result.potentialBic,

        foundAmounts = result.allAmounts.mapNotNull { mapAmount(it) },
        foundPercentages = result.percentages.mapNotNull { mapAmount(it) },
        foundDates = result.dates.mapNotNull { date ->
            // NOTE(review): assumes dates found in free text are not guaranteed to be valid calendar dates
            // (e.g. "31.02."); LocalDate.of() throws for those, so they are skipped
            try {
                LocalDate.of(date.year, date.month, date.day)
            } catch (e: java.time.DateTimeException) {
                null
            }
        },
        foundPotentialIbans = result.ibans.map { it.hit },
        foundPotentialBics = result.bics.map { it.hit },

        pdfText = pdfText
    )

    /** Maps the library's amount type to this module's [AmountOfMoney]; `null` stays `null`. */
    protected open fun mapAmount(amount: net.dankito.text.extraction.info.model.AmountOfMoney?) =
        amount?.let { AmountOfMoney(it.amount, it.currency, it.amountWithCurrency) }

}

View File

@ -0,0 +1,9 @@
package net.codinux.invoicing.pdf
import java.io.File
/**
 * Abstraction for extracting the plain text of a PDF file.
 *
 * Declared as a functional interface so that an implementation, e.g. a fake in a test, can also be
 * supplied as a lambda.
 */
fun interface PdfTextExtractor {

    /**
     * Extracts the text of [pdfFile].
     *
     * @return a [PdfTextExtractorResult] holding the extracted text and / or the error that occurred
     */
    fun extractTextFromPdf(pdfFile: File): PdfTextExtractorResult

}

View File

@ -0,0 +1,10 @@
package net.codinux.invoicing.pdf
/**
 * Result of extracting the text of a PDF.
 *
 * A non-null [text] is treated as success by [toString]; otherwise [error] is reported.
 */
data class PdfTextExtractorResult(
    val text: String?,
    val error: Throwable?,
) {

    override fun toString(): String =
        text?.let { "Success: $it" } ?: "Error: $error"

}

View File

@ -14,8 +14,13 @@ kotlinCoroutinesVersion=1.9.0
quarkusVersion=3.16.3
# Mustang 2.14 pulls PDFBox 3.x onto the classpath, which is incompatible with PDFBox 2.x used by pdfbox-text-extractor,
# but Mustang versions 2.13 and 2.12 are missing their dependencies in pom.xml
mustangVersion=2.14.2
textInfoExtractor=1.0.3
pdfboxTextExtractor=0.6.1
angusMailVersion=2.0.3
klfVersion=1.6.2