Implemented trying to read PdfInvoiceData from PDF attachments
This commit is contained in:
parent
1746346ceb
commit
1024346b5f
|
@ -16,6 +16,9 @@ val kotlinCoroutinesVersion: String by project
|
|||
|
||||
val mustangVersion: String by project
|
||||
|
||||
val textInfoExtractor: String by project
|
||||
val pdfboxTextExtractor: String by project
|
||||
|
||||
val angusMailVersion: String by project
|
||||
|
||||
val klfVersion: String by project
|
||||
|
@ -30,6 +33,10 @@ dependencies {
|
|||
implementation("org.mustangproject:library:$mustangVersion")
|
||||
implementation("org.mustangproject:validator:$mustangVersion")
|
||||
|
||||
// pdf invoice data extraction
|
||||
api("net.dankito.text.extraction:text-info-extractor:$textInfoExtractor")
|
||||
api("net.dankito.text.extraction:pdfbox-text-extractor:$pdfboxTextExtractor")
|
||||
|
||||
implementation("org.eclipse.angus:angus-mail:$angusMailVersion")
|
||||
|
||||
implementation("net.codinux.log:klf:$klfVersion")
|
||||
|
|
|
@ -9,6 +9,8 @@ import kotlinx.coroutines.*
|
|||
import net.codinux.invoicing.email.model.*
|
||||
import net.codinux.invoicing.filesystem.FileUtil
|
||||
import net.codinux.invoicing.model.Invoice
|
||||
import net.codinux.invoicing.pdf.PdfInvoiceData
|
||||
import net.codinux.invoicing.pdf.PdfInvoiceDataExtractor
|
||||
import net.codinux.invoicing.reader.EInvoiceReader
|
||||
import net.codinux.log.logger
|
||||
import org.eclipse.angus.mail.imap.IMAPFolder
|
||||
|
@ -21,6 +23,7 @@ import kotlin.math.max
|
|||
|
||||
open class EmailsFetcher(
|
||||
protected open val eInvoiceReader: EInvoiceReader = EInvoiceReader(),
|
||||
protected open val pdfInvoiceDataExtractor: PdfInvoiceDataExtractor = PdfInvoiceDataExtractor(),
|
||||
protected open val coroutineDispatcher: CoroutineDispatcher = Executors.newFixedThreadPool(max(24, Runtime.getRuntime().availableProcessors() * 4)).asCoroutineDispatcher()
|
||||
) {
|
||||
|
||||
|
@ -201,12 +204,14 @@ open class EmailsFetcher(
|
|||
|
||||
val (invoice, invoiceFile) = tryToReadEInvoice(part, extension, messagePart.mediaType, status)
|
||||
|
||||
val pdfInvoiceData: PdfInvoiceData? = tryToReadInvoiceDataFromPdf(extension, messagePart.mediaType, invoiceFile)
|
||||
|
||||
if (invoice != null || Part.ATTACHMENT.equals(part.disposition, ignoreCase = true)) {
|
||||
val file = invoiceFile ?:
|
||||
if (extension !in status.options.downloadAttachmentsWithExtensions) null
|
||||
else downloadAttachment(part, status)
|
||||
|
||||
return EmailAttachment(part.fileName, extension, part.size.takeIf { it > 0 }, mapDisposition(part), messagePart.mediaType, part.contentType, invoice, file)
|
||||
return EmailAttachment(part.fileName, extension, part.size.takeIf { it > 0 }, mapDisposition(part), messagePart.mediaType, part.contentType, invoice, pdfInvoiceData, file)
|
||||
}
|
||||
} catch (e: Throwable) {
|
||||
log.error(e) { "Could not check attachment '${messagePart.part.fileName}' (${messagePart.mediaType}) for eInvoice" }
|
||||
|
@ -243,6 +248,14 @@ open class EmailsFetcher(
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Tries to extract unstructured invoice data from [invoiceFile] if the attachment may be a PDF.
 *
 * An attachment is treated as a potential PDF if its [extension] is "pdf" or its [mediaType] is
 * "application/pdf" or "application/octet-stream".
 *
 * @return the extracted [PdfInvoiceData], or `null` if [invoiceFile] is `null`, the attachment is not
 * considered a PDF, or the extractor returned no data.
 */
private fun tryToReadInvoiceDataFromPdf(extension: String, mediaType: String, invoiceFile: File?): PdfInvoiceData? {
    // NOTE(review): presumably the file of a PDF attachment has already been downloaded by the preceding
    // eInvoice check, so invoiceFile should be non-null for PDFs — verify against caller
    if (invoiceFile == null) {
        return null
    }

    val mayBePdf = extension == "pdf" || mediaType == "application/pdf" || mediaType == "application/octet-stream"
    if (!mayBePdf) {
        return null
    }

    // TODO: pass result.error to status.onError()
    return pdfInvoiceDataExtractor.tryToExtractInvoiceData(invoiceFile).data
}
|
||||
|
||||
|
||||
protected open fun getAllMessageParts(part: Part): List<MessagePart> {
|
||||
return if (part.isMimeType("multipart/*")) {
|
||||
|
|
|
@ -36,9 +36,11 @@ class Email(
|
|||
|
||||
val hasAttachments: Boolean by lazy { attachments.isNotEmpty() }
|
||||
|
||||
val hasPdfAttachment: Boolean by lazy { attachments.any { it.isPdfFile } }
|
||||
|
||||
val hasEInvoiceAttachment: Boolean by lazy { attachments.any { it.containsEInvoice } }
|
||||
|
||||
val hasPdfAttachment: Boolean by lazy { attachments.any { it.isPdfFile } }
|
||||
/** `true` if for at least one attachment `couldExtractPdfInvoiceData` is `true`; evaluated lazily on first access. */
val hasAttachmentsWithExtractedInvoiceData: Boolean by lazy { attachments.any { it.couldExtractPdfInvoiceData } }
|
||||
|
||||
|
||||
override fun toString() = "${date.atZone(ZoneId.systemDefault()).toLocalDate()} $sender: $subject, ${attachments.size} attachment(s)"
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package net.codinux.invoicing.email.model
|
||||
|
||||
import net.codinux.invoicing.model.Invoice
|
||||
import net.codinux.invoicing.pdf.PdfInvoiceData
|
||||
import java.io.File
|
||||
|
||||
class EmailAttachment(
|
||||
|
@ -16,11 +17,14 @@ class EmailAttachment(
|
|||
val mediaType: String?,
|
||||
val contentType: String?,
|
||||
val invoice: Invoice? = null,
|
||||
val pdfInvoiceData: PdfInvoiceData? = null,
|
||||
val file: File? = null
|
||||
) {
|
||||
val isPdfFile: Boolean by lazy { extension == "pdf" || mediaType == "application/pdf" }
|
||||
|
||||
val containsEInvoice: Boolean by lazy { invoice != null }
|
||||
|
||||
val isPdfFile: Boolean by lazy { extension == "pdf" || mediaType == "application/pdf" }
|
||||
/** `true` if [pdfInvoiceData] is set (not `null`); evaluated lazily on first access. */
val couldExtractPdfInvoiceData: Boolean by lazy { pdfInvoiceData != null }
|
||||
|
||||
override fun toString() = "$filename: $invoice"
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
package net.codinux.invoicing.pdf
|
||||
|
||||
import java.math.BigDecimal
|
||||
|
||||
/**
 * An amount of money together with its currency.
 *
 * @property amount the numeric value of the amount
 * @property currency the currency of the amount; its format is not checked by this class
 * @property amountWithCurrency display form of the amount; defaults to [amount] and [currency] separated by a space
 */
class AmountOfMoney(
    val amount: BigDecimal,
    val currency: String,
    val amountWithCurrency: String = "$amount $currency",
) {

    /** Returns [amountWithCurrency]. */
    override fun toString(): String {
        return amountWithCurrency
    }

}
|
|
@ -0,0 +1,17 @@
|
|||
package net.codinux.invoicing.pdf
|
||||
|
||||
import net.dankito.text.extraction.ITextExtractor
|
||||
import net.dankito.text.extraction.pdf.PdfBoxPdfTextExtractor
|
||||
import java.io.File
|
||||
|
||||
/**
 * [PdfTextExtractor] implementation that delegates the actual work to an [ITextExtractor].
 *
 * This class has the same simple name as the library class it uses by default. The default value of
 * [textExtractor] is therefore written with its fully qualified name, so that it does not depend on
 * import precedence rules and can never be mistaken for (or resolved to) this class itself.
 */
open class PdfBoxPdfTextExtractor(
    protected open val textExtractor: ITextExtractor = net.dankito.text.extraction.pdf.PdfBoxPdfTextExtractor()
) : PdfTextExtractor {

    /**
     * Extracts the text of [pdfFile] with [textExtractor].
     *
     * @return a [PdfTextExtractorResult] holding the extracted text and, if the extractor reported an error,
     * the exception of that error.
     */
    override fun extractTextFromPdf(pdfFile: File): PdfTextExtractorResult {
        val result = textExtractor.extractText(pdfFile)

        return PdfTextExtractorResult(result.text, result.error?.exception)
    }

}
|
|
@ -0,0 +1,33 @@
|
|||
package net.codinux.invoicing.pdf
|
||||
|
||||
import java.math.BigDecimal
|
||||
import java.time.LocalDate
|
||||
|
||||
/**
 * Invoice data guessed from the text of a PDF.
 *
 * PDFs contain only unstructured data, so it's way harder to get invoice data from PDFs than from
 * structured XML eInvoice files.
 *
 * So we can only guess which is the total amount, which the net and VAT amount, which the invoice date, ...
 *
 * Therefore the names of the guessed properties start with 'potential' to reflect this circumstance.
 * The `found...` lists hold the amounts, percentages, dates, IBANs and BICs that were found,
 * and [pdfText] is the text the data has been extracted from.
 */
class PdfInvoiceData(
    // best guesses for the amounts of the invoice
    val potentialTotalAmount: AmountOfMoney,
    val potentialNetAmount: AmountOfMoney? = null,
    val potentialValueAddedTax: AmountOfMoney? = null,
    val potentialValueAddedTaxRate: BigDecimal? = null,

    // best guesses for the bank details
    val potentialIban: String? = null,
    val potentialBic: String? = null,

    // everything that has been found
    val foundAmounts: List<AmountOfMoney> = emptyList(),
    val foundPercentages: List<AmountOfMoney> = emptyList(),

    val foundDates: List<LocalDate> = emptyList(),

    val foundPotentialIbans: List<String> = emptyList(),
    val foundPotentialBics: List<String> = emptyList(),

    val pdfText: String
) {

    /** Returns the string representation of [potentialTotalAmount]. */
    override fun toString(): String = potentialTotalAmount.toString()

}
|
|
@ -0,0 +1,10 @@
|
|||
package net.codinux.invoicing.pdf
|
||||
|
||||
/**
 * Outcome of trying to extract invoice data from a PDF.
 *
 * @property error the error that prevented the extraction, if any
 * @property data the extracted invoice data, or `null` if none could be extracted
 */
class PdfInvoiceDataExtractionResult(
    val error: Throwable?,
    val data: PdfInvoiceData?
) {

    /** "Success: ..." with [data] if data is available, otherwise "Error: ..." with [error]. */
    override fun toString(): String = when (data) {
        null -> "Error: $error"
        else -> "Success: $data"
    }

}
|
|
@ -0,0 +1,58 @@
|
|||
package net.codinux.invoicing.pdf
|
||||
|
||||
import net.dankito.text.extraction.info.invoice.InvoiceDataExtractor
|
||||
import net.dankito.text.extraction.info.model.InvoiceData
|
||||
import java.io.File
|
||||
import java.time.LocalDate
|
||||
|
||||
/**
 * PDFs contain only unstructured data, so it's way harder to get invoice data from PDFs than from structured XML eInvoice files.
 *
 * But for validation purposes or PDFs without attached eInvoice XML we also try to extract unstructured invoice data from PDFs.
 */
open class PdfInvoiceDataExtractor(
    protected open val textExtractor: PdfTextExtractor = PdfBoxPdfTextExtractor(),
    protected open val invoiceDataExtractor: InvoiceDataExtractor = InvoiceDataExtractor()
) {

    /**
     * Extracts the text of [file] and tries to find invoice data in it.
     *
     * @return a result whose `data` is set if a total amount could be found. Otherwise `data` is `null` and
     * `error` is the error of the text extraction or of the invoice data extraction, or an
     * [IllegalStateException] if no total amount has been found. `error` can also be `null` if the text
     * extraction returned neither a text nor an error.
     */
    open fun tryToExtractInvoiceData(file: File): PdfInvoiceDataExtractionResult {
        val textExtractionResult = extractTextFromPdf(file)
        val pdfText = textExtractionResult.text
        if (textExtractionResult.error != null || pdfText == null) {
            return PdfInvoiceDataExtractionResult(textExtractionResult.error, null)
        }

        val result = invoiceDataExtractor.extractInvoiceData(pdfText)

        return when {
            result.error != null ->
                PdfInvoiceDataExtractionResult(result.error, null)
            result.potentialTotalAmount == null ->
                PdfInvoiceDataExtractionResult(IllegalStateException("Could not find total amount of invoice in PDF $file"), null)
            else ->
                PdfInvoiceDataExtractionResult(null, mapInvoiceData(result, pdfText))
        }
    }

    /** Extracts the text of [file] with [textExtractor]. */
    protected open fun extractTextFromPdf(file: File): PdfTextExtractorResult =
        textExtractor.extractTextFromPdf(file)


    /**
     * Maps the extraction [result] of the library to a [PdfInvoiceData].
     *
     * @param result the extracted data; its `potentialTotalAmount` must not be `null`
     * @param pdfText the text the data has been extracted from
     * @throws IllegalStateException if `result.potentialTotalAmount` is `null`
     */
    protected open fun mapInvoiceData(result: InvoiceData, pdfText: String) = PdfInvoiceData(
        potentialTotalAmount = checkNotNull(mapAmount(result.potentialTotalAmount)) {
            "InvoiceData without potentialTotalAmount cannot be mapped to PdfInvoiceData"
        },
        potentialNetAmount = mapAmount(result.potentialNetAmount),
        potentialValueAddedTax = mapAmount(result.potentialValueAddedTax),
        potentialValueAddedTaxRate = result.potentialValueAddedTaxRate?.amount,

        potentialIban = result.potentialIban,
        potentialBic = result.potentialBic,

        foundAmounts = result.allAmounts.mapNotNull { mapAmount(it) },
        foundPercentages = result.percentages.mapNotNull { mapAmount(it) },

        // LocalDate.of() throws a DateTimeException for values that do not form a valid calendar date.
        // NOTE(review): assumes the found dates are not guaranteed to be valid calendar dates — confirm.
        // Such candidates are skipped instead of letting the exception abort the whole extraction.
        foundDates = result.dates.mapNotNull { date ->
            try {
                LocalDate.of(date.year, date.month, date.day)
            } catch (e: java.time.DateTimeException) {
                null
            }
        },

        foundPotentialIbans = result.ibans.map { it.hit },
        foundPotentialBics = result.bics.map { it.hit },

        pdfText = pdfText
    )

    /** Maps the library's amount type to this project's [AmountOfMoney]; `null` stays `null`. */
    protected open fun mapAmount(amount: net.dankito.text.extraction.info.model.AmountOfMoney?) =
        amount?.let { AmountOfMoney(it.amount, it.currency, it.amountWithCurrency) }

}
|
|
@ -0,0 +1,9 @@
|
|||
package net.codinux.invoicing.pdf
|
||||
|
||||
import java.io.File
|
||||
|
||||
/**
 * Abstraction for extracting the text of a PDF file.
 */
interface PdfTextExtractor {
|
||||
|
||||
/**
 * Extracts the text of [pdfFile].
 *
 * @return a [PdfTextExtractorResult] holding the extracted text and / or the error that occurred
 */
fun extractTextFromPdf(pdfFile: File): PdfTextExtractorResult
|
||||
|
||||
}
|
|
@ -0,0 +1,10 @@
|
|||
package net.codinux.invoicing.pdf
|
||||
|
||||
/**
 * Outcome of extracting the text of a PDF.
 *
 * @property text the extracted text, or `null` if none is available
 * @property error the error that occurred during extraction, if any
 */
data class PdfTextExtractorResult(
    val text: String?,
    val error: Throwable?
) {

    /** "Success: ..." with [text] if a text is available, otherwise "Error: ..." with [error]. */
    override fun toString(): String = when (text) {
        null -> "Error: $error"
        else -> "Success: $text"
    }

}
|
|
@ -14,8 +14,13 @@ kotlinCoroutinesVersion=1.9.0
|
|||
quarkusVersion=3.16.3
|
||||
|
||||
|
||||
# Mustang 2.14 pulls PDFBox 3.x on the classpath, which is incompatible with PDFBox 2.x used by pdfbox-text-extractor
|
||||
# but Mustang versions 2.13 and 2.12 are missing their dependencies in pom.xml
|
||||
mustangVersion=2.14.2
|
||||
|
||||
textInfoExtractor=1.0.3
|
||||
pdfboxTextExtractor=0.6.1
|
||||
|
||||
angusMailVersion=2.0.3
|
||||
|
||||
klfVersion=1.6.2
|
||||
|
|
Loading…
Reference in New Issue