From 1024346b5f052d5a000cd1e07d0b500af856570f Mon Sep 17 00:00:00 2001 From: dankito Date: Sat, 30 Nov 2024 20:57:57 +0100 Subject: [PATCH] Implemented trying to read PdfInvoiceData from PDF attachments --- e-invoice-domain/build.gradle.kts | 7 +++ .../codinux/invoicing/email/EmailsFetcher.kt | 15 ++++- .../codinux/invoicing/email/model/Email.kt | 4 +- .../invoicing/email/model/EmailAttachment.kt | 6 +- .../codinux/invoicing/pdf/AmountOfMoney.kt | 11 ++++ .../invoicing/pdf/PdfBoxPdfTextExtractor.kt | 17 ++++++ .../codinux/invoicing/pdf/PdfInvoiceData.kt | 33 +++++++++++ .../pdf/PdfInvoiceDataExtractionResult.kt | 10 ++++ .../invoicing/pdf/PdfInvoiceDataExtractor.kt | 58 +++++++++++++++++++ .../codinux/invoicing/pdf/PdfTextExtractor.kt | 9 +++ .../invoicing/pdf/PdfTextExtractorResult.kt | 10 ++++ gradle.properties | 5 ++ 12 files changed, 182 insertions(+), 3 deletions(-) create mode 100644 e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/AmountOfMoney.kt create mode 100644 e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfBoxPdfTextExtractor.kt create mode 100644 e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfInvoiceData.kt create mode 100644 e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfInvoiceDataExtractionResult.kt create mode 100644 e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfInvoiceDataExtractor.kt create mode 100644 e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfTextExtractor.kt create mode 100644 e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfTextExtractorResult.kt diff --git a/e-invoice-domain/build.gradle.kts b/e-invoice-domain/build.gradle.kts index 08c2a2d..efd27d4 100644 --- a/e-invoice-domain/build.gradle.kts +++ b/e-invoice-domain/build.gradle.kts @@ -16,6 +16,9 @@ val kotlinCoroutinesVersion: String by project val mustangVersion: String by project +val textInfoExtractor: String by project +val pdfboxTextExtractor: String by project + val 
angusMailVersion: String by project val klfVersion: String by project @@ -30,6 +33,10 @@ dependencies { implementation("org.mustangproject:library:$mustangVersion") implementation("org.mustangproject:validator:$mustangVersion") + // pdf invoice data extraction + api("net.dankito.text.extraction:text-info-extractor:$textInfoExtractor") + api("net.dankito.text.extraction:pdfbox-text-extractor:$pdfboxTextExtractor") + implementation("org.eclipse.angus:angus-mail:$angusMailVersion") implementation("net.codinux.log:klf:$klfVersion") diff --git a/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/email/EmailsFetcher.kt b/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/email/EmailsFetcher.kt index 25c9ce0..50a73a9 100644 --- a/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/email/EmailsFetcher.kt +++ b/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/email/EmailsFetcher.kt @@ -9,6 +9,8 @@ import kotlinx.coroutines.* import net.codinux.invoicing.email.model.* import net.codinux.invoicing.filesystem.FileUtil import net.codinux.invoicing.model.Invoice +import net.codinux.invoicing.pdf.PdfInvoiceData +import net.codinux.invoicing.pdf.PdfInvoiceDataExtractor import net.codinux.invoicing.reader.EInvoiceReader import net.codinux.log.logger import org.eclipse.angus.mail.imap.IMAPFolder @@ -21,6 +23,7 @@ import kotlin.math.max open class EmailsFetcher( protected open val eInvoiceReader: EInvoiceReader = EInvoiceReader(), + protected open val pdfInvoiceDataExtractor: PdfInvoiceDataExtractor = PdfInvoiceDataExtractor(), protected open val coroutineDispatcher: CoroutineDispatcher = Executors.newFixedThreadPool(max(24, Runtime.getRuntime().availableProcessors() * 4)).asCoroutineDispatcher() ) { @@ -201,12 +204,14 @@ open class EmailsFetcher( val (invoice, invoiceFile) = tryToReadEInvoice(part, extension, messagePart.mediaType, status) + val pdfInvoiceData: PdfInvoiceData? 
= tryToReadInvoiceDataFromPdf(extension, messagePart.mediaType, invoiceFile) + if (invoice != null || Part.ATTACHMENT.equals(part.disposition, ignoreCase = true)) { val file = invoiceFile ?: if (extension !in status.options.downloadAttachmentsWithExtensions) null else downloadAttachment(part, status) - return EmailAttachment(part.fileName, extension, part.size.takeIf { it > 0 }, mapDisposition(part), messagePart.mediaType, part.contentType, invoice, file) + return EmailAttachment(part.fileName, extension, part.size.takeIf { it > 0 }, mapDisposition(part), messagePart.mediaType, part.contentType, invoice, pdfInvoiceData, file) } } catch (e: Throwable) { log.error(e) { "Could not check attachment '${messagePart.part.fileName}' (${messagePart.mediaType}) for eInvoice" } @@ -243,6 +248,14 @@ open class EmailsFetcher( } } + private fun tryToReadInvoiceDataFromPdf(extension: String, mediaType: String, invoiceFile: File?): PdfInvoiceData? = + // if it's a PDF, then tryToReadEInvoice() has already downloaded invoiceFile, so it must be non-null here + if (invoiceFile != null && (extension == "pdf" || mediaType == "application/pdf" || mediaType == "application/octet-stream")) { + pdfInvoiceDataExtractor.tryToExtractInvoiceData(invoiceFile).data // TODO: pass result.error to status.onError() + } else { + null + } + protected open fun getAllMessageParts(part: Part): List { return if (part.isMimeType("multipart/*")) { diff --git a/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/email/model/Email.kt b/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/email/model/Email.kt index a173e5f..b7f6094 100644 --- a/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/email/model/Email.kt +++ b/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/email/model/Email.kt @@ -36,9 +36,11 @@ class Email( val hasAttachments: Boolean by lazy { attachments.isNotEmpty() } + val hasPdfAttachment: Boolean by lazy { attachments.any { it.isPdfFile } } + val hasEInvoiceAttachment: Boolean by lazy { 
attachments.any { it.containsEInvoice } } - val hasPdfAttachment: Boolean by lazy { attachments.any { it.isPdfFile } } + val hasAttachmentsWithExtractedInvoiceData: Boolean by lazy { attachments.any { it.couldExtractPdfInvoiceData } } override fun toString() = "${date.atZone(ZoneId.systemDefault()).toLocalDate()} $sender: $subject, ${attachments.size} attachment(s)" diff --git a/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/email/model/EmailAttachment.kt b/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/email/model/EmailAttachment.kt index b4a77e8..86f5f1a 100644 --- a/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/email/model/EmailAttachment.kt +++ b/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/email/model/EmailAttachment.kt @@ -1,6 +1,7 @@ package net.codinux.invoicing.email.model import net.codinux.invoicing.model.Invoice +import net.codinux.invoicing.pdf.PdfInvoiceData import java.io.File class EmailAttachment( @@ -16,11 +17,14 @@ class EmailAttachment( val mediaType: String?, val contentType: String?, val invoice: Invoice? = null, + val pdfInvoiceData: PdfInvoiceData? = null, val file: File? 
= null ) { + val isPdfFile: Boolean by lazy { extension == "pdf" || mediaType == "application/pdf" } + val containsEInvoice: Boolean by lazy { invoice != null } - val isPdfFile: Boolean by lazy { extension == "pdf" || mediaType == "application/pdf" } + val couldExtractPdfInvoiceData: Boolean by lazy { pdfInvoiceData != null } override fun toString() = "$filename: $invoice" } \ No newline at end of file diff --git a/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/AmountOfMoney.kt b/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/AmountOfMoney.kt new file mode 100644 index 0000000..5013a85 --- /dev/null +++ b/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/AmountOfMoney.kt @@ -0,0 +1,11 @@ +package net.codinux.invoicing.pdf + +import java.math.BigDecimal + +class AmountOfMoney( + val amount: BigDecimal, + val currency: String, + val amountWithCurrency: String = "$amount $currency" +) { + override fun toString() = amountWithCurrency +} \ No newline at end of file diff --git a/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfBoxPdfTextExtractor.kt b/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfBoxPdfTextExtractor.kt new file mode 100644 index 0000000..fffa3b6 --- /dev/null +++ b/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfBoxPdfTextExtractor.kt @@ -0,0 +1,17 @@ +package net.codinux.invoicing.pdf + +import net.dankito.text.extraction.ITextExtractor +import net.dankito.text.extraction.pdf.PdfBoxPdfTextExtractor +import java.io.File + +open class PdfBoxPdfTextExtractor( + protected open val textExtractor: ITextExtractor = PdfBoxPdfTextExtractor() +) : PdfTextExtractor { + + override fun extractTextFromPdf(pdfFile: File): PdfTextExtractorResult { + val result = textExtractor.extractText(pdfFile) + + return PdfTextExtractorResult(result.text, result.error?.exception) + } + +} \ No newline at end of file diff --git a/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfInvoiceData.kt 
b/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfInvoiceData.kt new file mode 100644 index 0000000..61000a4 --- /dev/null +++ b/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfInvoiceData.kt @@ -0,0 +1,33 @@ +package net.codinux.invoicing.pdf + +import java.math.BigDecimal +import java.time.LocalDate + +/** + * PDFs contain only unstructured data, so it's way harder to get invoice data from PDFs than from structured XML eInvoice files. + * + * So we can only guess which amount is the total, which are the net and VAT amounts, which date is the invoice date, ... + * + * Therefore the guessed properties of this class contain 'potential' in their name to reflect this circumstance. + */ +class PdfInvoiceData( + val potentialTotalAmount: AmountOfMoney, + val potentialNetAmount: AmountOfMoney? = null, + val potentialValueAddedTax: AmountOfMoney? = null, + val potentialValueAddedTaxRate: BigDecimal? = null, + + val potentialIban: String? = null, + val potentialBic: String? = null, + + val foundAmounts: List = emptyList(), + val foundPercentages: List = emptyList(), + + val foundDates: List = emptyList(), + + val foundPotentialIbans: List = emptyList(), + val foundPotentialBics: List = emptyList(), + + val pdfText: String +) { + override fun toString() = "$potentialTotalAmount" +} \ No newline at end of file diff --git a/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfInvoiceDataExtractionResult.kt b/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfInvoiceDataExtractionResult.kt new file mode 100644 index 0000000..23fd49e --- /dev/null +++ b/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfInvoiceDataExtractionResult.kt @@ -0,0 +1,10 @@ +package net.codinux.invoicing.pdf + +class PdfInvoiceDataExtractionResult( + val error: Throwable?, + val data: PdfInvoiceData? 
+) { + override fun toString() = + if (data != null) "Success: $data" + else "Error: $error" +} \ No newline at end of file diff --git a/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfInvoiceDataExtractor.kt b/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfInvoiceDataExtractor.kt new file mode 100644 index 0000000..1ef13b1 --- /dev/null +++ b/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfInvoiceDataExtractor.kt @@ -0,0 +1,58 @@ +package net.codinux.invoicing.pdf + +import net.dankito.text.extraction.info.invoice.InvoiceDataExtractor +import net.dankito.text.extraction.info.model.InvoiceData +import java.io.File +import java.time.LocalDate + +/** + * PDFs contain only unstructured data, so it's way harder to get invoice data from PDFs than from structured XML eInvoice files. + * + * But for validation purposes or PDFs without attached eInvoice XML we also try to extract unstructured invoice data from PDFs. + */ +open class PdfInvoiceDataExtractor( + protected open val textExtractor: PdfTextExtractor = PdfBoxPdfTextExtractor(), + protected open val invoiceDataExtractor: InvoiceDataExtractor = InvoiceDataExtractor() +) { + + open fun tryToExtractInvoiceData(file: File): PdfInvoiceDataExtractionResult { + val textExtractionResult = extractTextFromPdf(file) + if (textExtractionResult.error != null || textExtractionResult.text == null) { + return PdfInvoiceDataExtractionResult(textExtractionResult.error, null) + } + + val pdfText = textExtractionResult.text + val result = invoiceDataExtractor.extractInvoiceData(pdfText) + + return if (result.error != null) { + PdfInvoiceDataExtractionResult(result.error, null) + } else if (result.potentialTotalAmount == null) { + PdfInvoiceDataExtractionResult(IllegalStateException("Could not find total amount of invoice in PDF $file"), null) + } else { + PdfInvoiceDataExtractionResult(null, mapInvoiceData(result, pdfText)) + } + } + + protected open fun extractTextFromPdf(file: File): 
PdfTextExtractorResult = + textExtractor.extractTextFromPdf(file) + + + protected open fun mapInvoiceData(result: InvoiceData, pdfText: String) = PdfInvoiceData( + mapAmount(result.potentialTotalAmount)!!, mapAmount(result.potentialNetAmount), + mapAmount(result.potentialValueAddedTax), result.potentialValueAddedTaxRate?.amount, + + result.potentialIban, result.potentialBic, + + result.allAmounts.mapNotNull { mapAmount(it) }, result.percentages.mapNotNull { mapAmount(it) }, + + result.dates.map { LocalDate.of(it.year, it.month, it.day) }, + + result.ibans.map { it.hit }, result.bics.map { it.hit }, + + pdfText + ) + + protected open fun mapAmount(amount: net.dankito.text.extraction.info.model.AmountOfMoney?) = + amount?.let { AmountOfMoney(it.amount, it.currency, it.amountWithCurrency) } + +} \ No newline at end of file diff --git a/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfTextExtractor.kt b/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfTextExtractor.kt new file mode 100644 index 0000000..925d4fa --- /dev/null +++ b/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfTextExtractor.kt @@ -0,0 +1,9 @@ +package net.codinux.invoicing.pdf + +import java.io.File + +interface PdfTextExtractor { + + fun extractTextFromPdf(pdfFile: File): PdfTextExtractorResult + +} \ No newline at end of file diff --git a/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfTextExtractorResult.kt b/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfTextExtractorResult.kt new file mode 100644 index 0000000..78e6cc1 --- /dev/null +++ b/e-invoice-domain/src/main/kotlin/net/codinux/invoicing/pdf/PdfTextExtractorResult.kt @@ -0,0 +1,10 @@ +package net.codinux.invoicing.pdf + +data class PdfTextExtractorResult( + val text: String?, + val error: Throwable? 
+) { + override fun toString() = + if (text != null) "Success: $text" + else "Error: $error" +} \ No newline at end of file diff --git a/gradle.properties b/gradle.properties index 17c535a..a8fbcdc 100644 --- a/gradle.properties +++ b/gradle.properties @@ -14,8 +14,13 @@ kotlinCoroutinesVersion=1.9.0 quarkusVersion=3.16.3 +# Mustang 2.14 pulls PDFBox 3.x onto the classpath, which is incompatible with the PDFBox 2.x used by pdfbox-text-extractor, +# but Mustang versions 2.13 and 2.12 are missing their dependencies in pom.xml mustangVersion=2.14.2 +textInfoExtractor=1.0.3 +pdfboxTextExtractor=0.6.1 + angusMailVersion=2.0.3 klfVersion=1.6.2