diff --git a/.env.tpl b/.env.tpl index 9e067f0997d11d92cc554711b601c8ada3d71cb8..07db6275b20656705f703908320fa2039231ac4c 100644 --- a/.env.tpl +++ b/.env.tpl @@ -168,3 +168,11 @@ NOPAQUE_SOCKETIO_MESSAGE_QUEUE_URI= # DEFAULT: 0 # Number of values to trust for X-Forwarded-Proto # NOPAQUE_PROXY_FIX_X_PROTO= + +# CHOOSE ONE: False, True +# DEFAULT: False +# NOPAQUE_TRANSKRIBUS_ENABLED= + +# READ-COOP account data: https://readcoop.eu/ +# NOPAQUE_READCOOP_USERNAME= +# NOPAQUE_READCOOP_PASSWORD= \ No newline at end of file diff --git a/.gitignore b/.gitignore index 61a99e04add2d7c91440488cb60e72c0aab6c906..76c4e06b7c77cae22e913df43258e4ce16e6568e 100644 --- a/.gitignore +++ b/.gitignore @@ -18,7 +18,8 @@ data/** pip-log.txt # Logs in log folder -logs/*.log +logs/* +!logs/dummy # Packages *.egg diff --git a/Dockerfile b/Dockerfile index c327703ff2cbe3d39e91598cbe48152fed5fd12c..6ec450b5d0cccdea5b364151c1c5f3f4adb68794 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ -FROM python:3.9.8-slim-buster +FROM python:3.8.13-slim-buster -LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>, Stephan Porada <sporada@uni-bielefeld.de>" +LABEL authors="Patrick Jentsch <p.jentsch@uni-bielefeld.de>" ARG DOCKER_GID diff --git a/app/TesseractOCRModel.defaults.yml b/app/TesseractOCRModel.defaults.yml index 37929e89d70c609a834f72aaae1f0ec0190419ca..0d067d54ac35193249dcff1e2a085b6045e856e5 100644 --- a/app/TesseractOCRModel.defaults.yml +++ b/app/TesseractOCRModel.defaults.yml @@ -6,6 +6,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Amharic' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/amh.traineddata' @@ -14,6 +18,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' - title: 'Arabic' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ara.traineddata' @@ -22,6 +30,10 @@ 
version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' # - title: 'Assamese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/asm.traineddata' @@ -30,6 +42,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Azerbaijani' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze.traineddata' @@ -38,6 +54,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Azerbaijani - Cyrillic' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/aze_cyrl.traineddata' @@ -46,6 +66,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Belarusian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bel.traineddata' @@ -54,6 +78,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Bengali' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ben.traineddata' @@ -62,6 +90,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Tibetan' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bod.traineddata' @@ -70,6 +102,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Bosnian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bos.traineddata' @@ -78,6 +114,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Bulgarian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/bul.traineddata' @@ 
-86,6 +126,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Catalan; Valencian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cat.traineddata' @@ -94,6 +138,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Cebuano' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ceb.traineddata' @@ -102,6 +150,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Czech' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ces.traineddata' @@ -110,6 +162,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Chinese - Simplified' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_sim.traineddata' @@ -118,6 +174,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' - title: 'Chinese - Traditional' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chi_tra.traineddata' @@ -126,6 +186,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' # - title: 'Cherokee' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/chr.traineddata' @@ -134,6 +198,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Welsh' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/cym.traineddata' @@ -142,6 +210,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' - title: 'Danish' description: '' url: 
'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dan.traineddata' @@ -150,6 +222,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' - title: 'German' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/deu.traineddata' @@ -158,6 +234,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' # - title: 'Dzongkha' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/dzo.traineddata' @@ -166,6 +246,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' - title: 'Greek, Modern (1453-)' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ell.traineddata' @@ -174,6 +258,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' - title: 'English' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eng.traineddata' @@ -182,6 +270,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' - title: 'English, Middle (1100-1500)' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/enm.traineddata' @@ -190,6 +282,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' # - title: 'Esperanto' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/epo.traineddata' @@ -198,6 +294,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Estonian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/est.traineddata' @@ -206,6 +306,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Basque' # description: '' # url: 
'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/eus.traineddata' @@ -214,6 +318,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Persian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fas.traineddata' @@ -222,6 +330,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Finnish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fin.traineddata' @@ -230,6 +342,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' - title: 'French' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/fra.traineddata' @@ -238,6 +354,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' - title: 'German Fraktur' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frk.traineddata' @@ -246,6 +366,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' - title: 'French, Middle (ca. 
1400-1600)' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/frm.traineddata' @@ -254,6 +378,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' # - title: 'Irish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/gle.traineddata' @@ -262,6 +390,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Galician' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/glg.traineddata' @@ -270,6 +402,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' - title: 'Greek, Ancient (-1453)' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/grc.traineddata' @@ -278,6 +414,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' # - title: 'Gujarati' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/guj.traineddata' @@ -286,6 +426,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Haitian; Haitian Creole' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hat.traineddata' @@ -294,6 +438,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Hebrew' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/heb.traineddata' @@ -302,6 +450,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Hindi' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hin.traineddata' @@ -310,6 +462,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # 
- title: 'Croatian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hrv.traineddata' @@ -318,6 +474,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Hungarian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/hun.traineddata' @@ -326,6 +486,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Inuktitut' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/iku.traineddata' @@ -334,6 +498,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Indonesian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ind.traineddata' @@ -342,6 +510,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Icelandic' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/isl.traineddata' @@ -350,6 +522,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' - title: 'Italian' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita.traineddata' @@ -358,6 +534,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' - title: 'Italian - Old' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ita_old.traineddata' @@ -366,6 +546,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' # - title: 'Javanese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jav.traineddata' @@ -374,6 +558,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - 
title: 'Japanese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/jpn.traineddata' @@ -382,6 +570,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Kannada' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kan.traineddata' @@ -390,6 +582,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Georgian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat.traineddata' @@ -398,6 +594,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Georgian - Old' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kat_old.traineddata' @@ -406,6 +606,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Kazakh' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kaz.traineddata' @@ -414,6 +618,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Central Khmer' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/khm.traineddata' @@ -422,6 +630,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Kirghiz; Kyrgyz' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kir.traineddata' @@ -430,6 +642,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Korean' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kor.traineddata' @@ -438,6 +654,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - 
'0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Kurdish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/kur.traineddata' @@ -446,6 +666,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Lao' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lao.traineddata' @@ -454,6 +678,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Latin' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lat.traineddata' @@ -462,6 +690,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Latvian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lav.traineddata' @@ -470,6 +702,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Lithuanian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/lit.traineddata' @@ -478,6 +714,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Malayalam' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mal.traineddata' @@ -486,6 +726,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Marathi' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mar.traineddata' @@ -494,6 +738,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Macedonian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mkd.traineddata' @@ -502,6 +750,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - 
'0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Maltese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mlt.traineddata' @@ -510,6 +762,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Malay' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/msa.traineddata' @@ -518,6 +774,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Burmese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/mya.traineddata' @@ -526,6 +786,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Nepali' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nep.traineddata' @@ -534,6 +798,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Dutch; Flemish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nld.traineddata' @@ -542,6 +810,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Norwegian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/nor.traineddata' @@ -550,6 +822,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Oriya' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ori.traineddata' @@ -558,6 +834,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Panjabi; Punjabi' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pan.traineddata' @@ -566,6 +846,10 @@ # version: '4.1.0' # 
compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Polish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pol.traineddata' @@ -574,6 +858,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' - title: 'Portuguese' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/por.traineddata' @@ -582,6 +870,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' # - title: 'Pushto; Pashto' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/pus.traineddata' @@ -590,6 +882,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Romanian; Moldavian; Moldovan' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ron.traineddata' @@ -598,6 +894,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' - title: 'Russian' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/rus.traineddata' @@ -606,6 +906,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' # - title: 'Sanskrit' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/san.traineddata' @@ -614,6 +918,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Sinhala; Sinhalese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sin.traineddata' @@ -622,6 +930,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Slovak' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slk.traineddata' @@ -630,6 +942,10 @@ # 
version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Slovenian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/slv.traineddata' @@ -638,6 +954,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' - title: 'Spanish; Castilian' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa.traineddata' @@ -646,6 +966,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' - title: 'Spanish; Castilian - Old' description: '' url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/spa_old.traineddata' @@ -654,6 +978,10 @@ version: '4.1.0' compatible_service_versions: - '0.1.0' + - '0.1.1' + - '0.1.2' + - '0.1.3' + - '0.1.4' # - title: 'Albanian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/sqi.traineddata' @@ -662,6 +990,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Serbian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp.traineddata' @@ -670,6 +1002,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Serbian - Latin' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/srp_latn.traineddata' @@ -678,6 +1014,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Swahili' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swa.traineddata' @@ -686,6 +1026,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Swedish' # description: '' # url: 
'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/swe.traineddata' @@ -694,6 +1038,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Syriac' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/syr.traineddata' @@ -702,6 +1050,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Tamil' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tam.traineddata' @@ -710,6 +1062,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Telugu' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tel.traineddata' @@ -718,6 +1074,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Tajik' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgk.traineddata' @@ -726,6 +1086,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Tagalog' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tgl.traineddata' @@ -734,6 +1098,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Thai' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tha.traineddata' @@ -742,6 +1110,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Tigrinya' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tir.traineddata' @@ -750,6 +1122,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Turkish' # description: 
'' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/tur.traineddata' @@ -758,6 +1134,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Uighur; Uyghur' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uig.traineddata' @@ -766,6 +1146,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Ukrainian' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/ukr.traineddata' @@ -774,6 +1158,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Urdu' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/urd.traineddata' @@ -782,6 +1170,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Uzbek' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb.traineddata' @@ -790,6 +1182,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Uzbek - Cyrillic' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/uzb_cyrl.traineddata' @@ -798,6 +1194,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Vietnamese' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/vie.traineddata' @@ -806,6 +1206,10 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' # - title: 'Yiddish' # description: '' # url: 'https://github.com/tesseract-ocr/tessdata/raw/4.1.0/yid.traineddata' @@ -814,3 +1218,7 @@ # version: '4.1.0' # compatible_service_versions: # - '0.1.0' +# - '0.1.1' +# - '0.1.2' +# - '0.1.3' +# - '0.1.4' 
diff --git a/app/TranskribusHTRModel.defaults.yml b/app/TranskribusHTRModel.defaults.yml new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/app/__init__.py b/app/__init__.py index 46ca4533b8af7b9671bb0a7a663fa3d1f19104fc..9db97d3f2558fdc8e5f7f6347d568ccbd94810f5 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1,5 +1,6 @@ from config import Config from flask import Flask +from flask_apscheduler import APScheduler from flask_login import LoginManager from flask_mail import Mail from flask_migrate import Migrate @@ -20,6 +21,7 @@ mail: Mail = Mail() migrate: Migrate = Migrate() paranoid: Paranoid = Paranoid() paranoid.redirect_view = '/' +scheduler: APScheduler = APScheduler() # TODO: Use this! socketio: SocketIO = SocketIO() diff --git a/app/cli.py b/app/cli.py index e588eef97dbcebad5721ea21d99f6d06bcd68c8a..4bff77d3c89da2bd35f875831fdd930caf973849 100644 --- a/app/cli.py +++ b/app/cli.py @@ -2,9 +2,8 @@ from flask import current_app from flask_migrate import upgrade from . import db from .models import Corpus, Job, Role, User, TesseractOCRModel -import json +import click import os -import re def _make_default_dirs(): @@ -56,6 +55,19 @@ def register(app): daemon: Daemon = Daemon() daemon.run() + @app.cli.group() + def converter(): + ''' Converter commands. ''' + pass + + @converter.command() + @click.argument('json_db') + @click.argument('data_dir') + def sandpaper(json_db, data_dir): + ''' Sandpaper converter ''' + from app.converters.sandpaper import convert + convert(json_db, data_dir) + @app.cli.group() def test(): ''' Test commands. ''' @@ -68,55 +80,3 @@ def register(app): from unittest.suite import TestSuite tests: TestSuite = TestLoader().discover('tests') TextTestRunner(verbosity=2).run(tests) - - @app.cli.group() - def convert(): - ''' Datebase convert commands. 
''' - - @convert.command() - def nlp_jobs(): - for job in Job.query.filter_by(service='nlp').all(): - job.service = 'spacy-nlp' - service_args = json.loads(job.service_args) - new_service_args = {} - for service_arg in service_args: - if service_arg == '--check-encoding': - new_service_args['encoding_detection'] = True - elif re.match(r'-l ([a-z]{2})', service_arg): - language_code = re.search(r'-l ([a-z]{2})', service_arg).group(1) # noqa - new_service_args['language'] = language_code - job.service_args = json.dumps(new_service_args) - db.session.commit() - - @convert.command() - def ocr_jobs(): - # Language code to TesseractOCRModel.title lookup - language_code_lookup = { - 'ara': 'Arabic', - 'chi_tra': 'Chinese - Traditional', - 'dan': 'Danish', - 'eng': 'English', - 'enm': 'English, Middle (1100-1500)', - 'fra': 'French', - 'frm': 'French, Middle (ca. 1400-1600)', - 'deu': 'German', - 'frk': 'German Fraktur', - 'ell': 'Greek, Modern (1453-)', - 'ita': 'Italian', - 'por': 'Portuguese', - 'rus': 'Russian', - 'spa': 'Spanish; Castilian' - } - for job in Job.query.filter_by(service='ocr').all(): - job.service = 'tesseract-ocr' - service_args = json.loads(job.service_args) - new_service_args = {} - for service_arg in service_args: - if service_arg == '--binarize': - new_service_args['binarization'] = True - elif re.match(r'-l ([a-z]{3})', service_arg): - language_code = re.search(r'-l ([a-z]{3})', service_arg).group(1) # noqa - tesseract_ocr_model = TesseractOCRModel.query.filter_by(title=language_code_lookup[language_code]).first() # noqa - new_service_args['model'] = tesseract_ocr_model.id - job.service_args = json.dumps(new_service_args) - db.session.commit() diff --git a/app/converters/__init__.py b/app/converters/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/app/converters/sandpaper.py b/app/converters/sandpaper.py new file mode 100644 index 
0000000000000000000000000000000000000000..3172183e7dd2cd84edc46b0357b54fa9f2303c52 --- /dev/null +++ b/app/converters/sandpaper.py @@ -0,0 +1,215 @@ +from flask import current_app +from app import db +from app.models import User, Corpus, CorpusFile +from datetime import datetime +import json +import os + + +def convert(json_db_file, data_dir): + with open(json_db_file, 'r') as f: + json_db = json.loads(f.read()) + + for json_user in json_db: + if not json_user['confirmed']: + current_app.logger.info(f'Skip unconfirmed user {json_user["username"]}') + continue + user_dir = os.path.join(data_dir, json_user['id']) + convert_user(json_user, user_dir) + db.session.commit() + + +def convert_user(json_user, user_dir): + current_app.logger.info(f'Create User {json_user["username"]}...') + user = User( + confirmed=json_user['confirmed'], + email=json_user['email'], + last_seen=datetime.fromtimestamp(json_user['last_seen']), + member_since=datetime.fromtimestamp(json_user['member_since']), + password_hash=json_user['password_hash'], # TODO: Needs to be added manually + username=json_user['username'] + ) + db.session.add(user) + db.session.flush(objects=[user]) + db.session.refresh(user) + try: + user.makedirs() + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + raise Exception('Internal Server Error') + for json_corpus in json_user['corpora'].values(): + if not json_corpus['files'].values(): + current_app.logger.info(f'Skip empty corpus {json_corpus["title"]}') + continue + corpus_dir = os.path.join(user_dir, 'corpora', json_corpus['id']) + convert_corpus(json_corpus, user, corpus_dir) + current_app.logger.info('Done') + + +def convert_corpus(json_corpus, user, corpus_dir): + current_app.logger.info(f'Create Corpus {json_corpus["title"]}...') + corpus = Corpus( + user=user, + creation_date=datetime.fromtimestamp(json_corpus['creation_date']), + description=json_corpus['description'], + 
last_edited_date=datetime.fromtimestamp(json_corpus['last_edited_date']), + title=json_corpus['title'] + ) + db.session.add(corpus) + db.session.flush(objects=[corpus]) + db.session.refresh(corpus) + try: + corpus.makedirs() + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + raise Exception('Internal Server Error') + for json_corpus_file in json_corpus['files'].values(): + corpus_file_dir = os.path.join(corpus_dir, 'files', json_corpus_file['id']) + convert_corpus_file(json_corpus_file, corpus, corpus_file_dir) + current_app.logger.info('Done') + + +def convert_corpus_file(json_corpus_file, corpus, corpus_file_dir): + current_app.logger.info(f'Create CorpusFile {json_corpus_file["title"]}...') + corpus_file = CorpusFile( + corpus=corpus, + address=json_corpus_file['address'], + author=json_corpus_file['author'], + booktitle=json_corpus_file['booktitle'], + chapter=json_corpus_file['chapter'], + editor=json_corpus_file['editor'], + filename=json_corpus_file['filename'], + institution=json_corpus_file['institution'], + journal=json_corpus_file['journal'], + mimetype='application/vrt+xml', + pages=json_corpus_file['pages'], + publisher=json_corpus_file['publisher'], + publishing_year=json_corpus_file['publishing_year'], + school=json_corpus_file['school'], + title=json_corpus_file['title'] + ) + db.session.add(corpus_file) + db.session.flush(objects=[corpus_file]) + db.session.refresh(corpus_file) + try: + convert_vrt( + os.path.join(corpus_file_dir, json_corpus_file['filename']), + corpus_file.path + ) + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + raise Exception('Internal Server Error') + current_app.logger.info('Done') + + +def convert_vrt(input_file, output_file): + def check_pos_attribute_order(vrt_lines): + # The following orders are possible: + # since 26.02.2019: 'word,lemma,simple_pos,pos,ner' + # since 26.03.2021: 'word,pos,lemma,simple_pos,ner' + # since 27.01.2022: 'word,pos,lemma,simple_pos' + # 
This Function tries to find out which order we have by looking at the + # number of attributes and the position of the simple_pos attribute + SIMPLE_POS_LABELS = [ + 'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', + 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', + 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', + 'VERB', 'X' + ] + for line in vrt_lines: + if line.startswith('<'): + continue + pos_attrs = line.rstrip('\n').split('\t') + num_pos_attrs = len(pos_attrs) + if num_pos_attrs == 4: + if pos_attrs[3] in SIMPLE_POS_LABELS: + return ['word', 'pos', 'lemma', 'simple_pos'] + continue + elif num_pos_attrs == 5: + if pos_attrs[2] in SIMPLE_POS_LABELS: + return ['word', 'lemma', 'simple_pos', 'pos', 'ner'] + elif pos_attrs[3] in SIMPLE_POS_LABELS: + return ['word', 'pos', 'lemma', 'simple_pos', 'ner'] + continue + return None + + + def check_has_ent_as_s_attr(vrt_lines): + for line in vrt_lines: + if line.startswith('<ent'): + return True + return False + + + def pos_attrs_to_string_1(pos_attrs): + return f'{pos_attrs[0]}\t{pos_attrs[3]}\t{pos_attrs[1]}\t{pos_attrs[2]}\n' + + + def pos_attrs_to_string_2(pos_attrs): + return f'{pos_attrs[0]}\t{pos_attrs[1]}\t{pos_attrs[2]}\t{pos_attrs[3]}\n' + + + with open(input_file) as f: + input_vrt_lines = f.readlines() + + pos_attr_order = check_pos_attribute_order(input_vrt_lines) + has_ent_as_s_attr = check_has_ent_as_s_attr(input_vrt_lines) + + print(f'Detected pos_attr_order: [{",".join(pos_attr_order)}]') + print(f'Detected has_ent_as_s_attr: {has_ent_as_s_attr}') + + if pos_attr_order == ['word', 'lemma', 'simple_pos', 'pos', 'ner']: + pos_attrs_to_string_function = pos_attrs_to_string_1 + elif pos_attr_order == ['word', 'pos', 'lemma', 'simple_pos', 'ner']: + pos_attrs_to_string_function = pos_attrs_to_string_2 + elif pos_attr_order == ['word', 'pos', 'lemma', 'simple_pos']: + pos_attrs_to_string_function = pos_attrs_to_string_2 + else: + raise Exception('Can not handle format') + + current_ent = None + output_vrt = '' + for line in 
input_vrt_lines: + if line.strip() == '': + continue + if line.startswith('<'): + if not has_ent_as_s_attr: + if current_ent is not None: + output_vrt += '</ent>\n' + current_ent = None + if ( + line.startswith('<corpus') + or line.startswith('</corpus') + or line.startswith('<nlp') + ): + continue + elif line.startswith('<text'): + output_vrt += '<text>\n' + continue + elif line.startswith('<s'): + output_vrt += '<s>\n' + continue + output_vrt += line + continue + pos_attrs = line.rstrip('\n').split('\t') + if not has_ent_as_s_attr: + if pos_attrs[4].lower() in ['null', 'none']: + if current_ent: + output_vrt += '</ent>\n' + current_ent = None + else: + if current_ent is None: + output_vrt += f'<ent type="{pos_attrs[4]}">\n' + current_ent = pos_attrs[4] + elif current_ent != pos_attrs[4]: + output_vrt += '</ent>\n' + current_ent = None + output_vrt += f'<ent type="{pos_attrs[4]}">\n' + current_ent = pos_attrs[4] + output_vrt += pos_attrs_to_string_function(pos_attrs) + + with open(output_file, 'w') as f: + f.write(output_vrt) diff --git a/app/corpora/routes.py b/app/corpora/routes.py index fd0085eec2ec69384a504a6944411e6e5ed21eab..3b334d83b0e7b67ef1fc57e00a8b75e066cf8d07 100644 --- a/app/corpora/routes.py +++ b/app/corpora/routes.py @@ -319,7 +319,7 @@ def corpus_file(corpus_id, corpus_file_id): form.title.data = corpus_file.title return render_template( 'corpora/corpus_file.html.j2', - corpus=corpus, + corpus=corpus_file.corpus, corpus_file=corpus_file, form=form, title='Edit corpus file' diff --git a/app/daemon/job_utils.py b/app/daemon/job_utils.py index e4797e264ac28de4d303011cac9e7359e3ffc33b..e56bafbc4e6b9ca03f9278649d63c019c5a0ce88 100644 --- a/app/daemon/job_utils.py +++ b/app/daemon/job_utils.py @@ -22,34 +22,46 @@ class CheckJobsMixin: def create_job_service(self, job): ''' # Docker service settings # ''' ''' ## Service specific settings ## ''' - if job.service == 'file-setup': + if job.service == 'file-setup-pipeline': mem_mb = 512 n_cores = 2 - 
executable = 'file-setup' - image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup:v{job.service_version}' # noqa - elif job.service == 'tesseract-ocr': - mem_mb = 2048 + executable = 'file-setup-pipeline' + image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}file-setup-pipeline:v{job.service_version}' # noqa + elif job.service == 'tesseract-ocr-pipeline': + mem_mb = 1024 + n_cores = 4 + executable = 'tesseract-ocr-pipeline' + image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}tesseract-ocr-pipeline:v{job.service_version}' # noqa + elif job.service == 'transkribus-htr-pipeline': + mem_mb = 1024 n_cores = 4 - executable = 'ocr' - image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}ocr:v{job.service_version}' # noqa - elif job.service == 'spacy-nlp': + executable = 'transkribus-htr-pipeline' + image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}transkribus-htr-pipeline:v{job.service_version}' # noqa + elif job.service == 'spacy-nlp-pipeline': mem_mb = 1024 n_cores = 1 - executable = 'nlp' - image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}nlp:v{job.service_version}' # noqa + executable = 'spacy-nlp-pipeline' + image = f'{current_app.config["NOPAQUE_DOCKER_IMAGE_PREFIX"]}spacy-nlp-pipeline:v{job.service_version}' # noqa ''' ## Command ## ''' command = f'{executable} -i /input -o /output' command += ' --log-dir /logs' command += f' --mem-mb {mem_mb}' command += f' --n-cores {n_cores}' - service_args = json.loads(job.service_args) - if job.service == 'spacy-nlp': - command += f' -m {service_args["model"]}' - if 'encoding_detection' in service_args and service_args['encoding_detection']: # noqa + if job.service == 'spacy-nlp-pipeline': + command += f' -m {job.service_args["model"]}' + if 'encoding_detection' in job.service_args and job.service_args['encoding_detection']: # noqa command += ' --check-encoding' - elif job.service == 'tesseract-ocr': - command += f' -m {service_args["model"]}' - if 
'binarization' in service_args and service_args['binarization']: + elif job.service == 'tesseract-ocr-pipeline': + command += f' -m {job.service_args["model"]}' + if 'binarization' in job.service_args and job.service_args['binarization']: + command += ' --binarize' + elif job.service == 'transkribus-htr-pipeline': + command += f' -m {job.service_args["model"]}' + readcoop_username = current_app.config.get('NOPAQUE_READCOOP_USERNAME') + command += f' --readcoop-username "{readcoop_username}"' + readcoop_password = current_app.config.get('NOPAQUE_READCOOP_PASSWORD') + command += f' --readcoop-password "{readcoop_password}"' + if 'binarization' in job.service_args and job.service_args['binarization']: command += ' --binarize' ''' ## Constraints ## ''' constraints = ['node.role==worker'] @@ -63,16 +75,15 @@ class CheckJobsMixin: mounts = [] ''' ### Input mount(s) ### ''' input_mount_target_base = '/input' - if job.service == 'file-setup': + if job.service == 'file-setup-pipeline': input_mount_target_base += f'/{secure_filename(job.title)}' for job_input in job.inputs: input_mount_source = job_input.path - input_mount_target = f'/{input_mount_target_base}/{job_input.filename}' # noqa + input_mount_target = f'{input_mount_target_base}/{job_input.filename}' # noqa input_mount = f'{input_mount_source}:{input_mount_target}:ro' mounts.append(input_mount) - if job.service == 'tesseract-ocr': - service_args = json.loads(job.service_args) - model = TesseractOCRModel.query.get(service_args['model']) + if job.service == 'tesseract-ocr-pipeline': + model = TesseractOCRModel.query.get(job.service_args['model']) if model is None: job.status = JobStatus.FAILED return @@ -114,7 +125,8 @@ class CheckJobsMixin: mounts=mounts, name=name, resources=resources, - restart_policy=restart_policy + restart_policy=restart_policy, + user='1000:1000' ) except docker.errors.APIError as e: current_app.logger.error( diff --git a/app/models.py b/app/models.py index 
d391807a206d4054dd0ad0e2f2c9c5980c27b1fa..528b11dc94ca1fdb47c65dc1a06a109a47a9bd61 100644 --- a/app/models.py +++ b/app/models.py @@ -36,14 +36,23 @@ class IntEnumColumn(db.TypeDecorator): return self.enum_type(value) -class Permission(IntEnum): - ''' - Defines User permissions as integers by the power of 2. User permission - can be evaluated using the bitwise operator &. - ''' - ADMINISTRATE = 4 - CONTRIBUTE = 2 - USE_API = 1 +class ContainerColumn(db.TypeDecorator): + impl = db.String + + def __init__(self, container_type, *args, **kwargs): + super().__init__(*args, **kwargs) + self.container_type = container_type + + def process_bind_param(self, value, dialect): + if isinstance(value, self.container_type): + return json.dumps(value) + elif isinstance(value, str) and isinstance(json.loads(value), self.container_type): # noqa + return value + else: + return TypeError() + + def process_result_value(self, value, dialect): + return json.loads(value) class FileMixin: @@ -61,6 +70,16 @@ class FileMixin: } +class Permission(IntEnum): + ''' + Defines User permissions as integers by the power of 2. User permission + can be evaluated using the bitwise operator &. 
+ ''' + ADMINISTRATE = 1 + CONTRIBUTE = 2 + USE_API = 4 + + class Role(HashidMixin, db.Model): __tablename__ = 'roles' # Primary key @@ -102,7 +121,7 @@ class Role(HashidMixin, db.Model): 'permissions': self.permissions } if relationships: - dict_role['users']: { + dict_role['users'] = { x.to_dict(backrefs=False, relationships=True) for x in self.users } @@ -339,10 +358,11 @@ class TesseractOCRModel(FileMixin, HashidMixin, db.Model): # Foreign keys user_id = db.Column(db.Integer, db.ForeignKey('users.id')) # Fields - compatible_service_versions = db.Column(db.String(255)) + compatible_service_versions = db.Column(ContainerColumn(list, 255)) description = db.Column(db.String(255)) publisher = db.Column(db.String(128)) publishing_year = db.Column(db.Integer) + shared = db.Column(db.Boolean, default=False) title = db.Column(db.String(64)) version = db.Column(db.String(16)) # Backrefs: user: User @@ -356,11 +376,10 @@ class TesseractOCRModel(FileMixin, HashidMixin, db.Model): ) def to_dict(self, backrefs=False, relationships=False): - compatible_service_versions = json.loads(self.compatible_service_versions) # noqa dict_tesseract_ocr_model = { 'id': self.hashid, 'user_id': self.user.hashid, - 'compatible_service_versions': compatible_service_versions, + 'compatible_service_versions': self.compatible_service_versions, 'description': self.description, 'publisher': self.publisher, 'publishing_year': self.publishing_year, @@ -384,31 +403,39 @@ class TesseractOCRModel(FileMixin, HashidMixin, db.Model): with open(defaults_file, 'r') as f: defaults = yaml.safe_load(f) for m in defaults: - if TesseractOCRModel.query.filter_by(title=m['title'], version=m['version']).first() is not None: # noqa + model = TesseractOCRModel.query.filter_by(title=m['title'], version=m['version']).first() # noqa + if model is not None: + model.compatible_service_versions = m['compatible_service_versions'] + model.description = m['description'] + model.publisher = m['publisher'] + 
model.publishing_year = m['publishing_year'] + model.title = m['title'] + model.version = m['version'] continue - tesseract_ocr_model = TesseractOCRModel( - compatible_service_versions=json.dumps(m['compatible_service_versions']), # noqa + model = TesseractOCRModel( + compatible_service_versions=m['compatible_service_versions'], description=m['description'], publisher=m['publisher'], publishing_year=m['publishing_year'], + shared=True, title=m['title'], user=user, version=m['version'] ) - db.session.add(tesseract_ocr_model) - db.session.flush(objects=[tesseract_ocr_model]) - db.session.refresh(tesseract_ocr_model) - tesseract_ocr_model.filename = f'{tesseract_ocr_model.id}.traineddata' # noqa + db.session.add(model) + db.session.flush(objects=[model]) + db.session.refresh(model) + model.filename = f'{model.id}.traineddata' r = requests.get(m['url'], stream=True) pbar = tqdm( - desc=f'{tesseract_ocr_model.title} ({tesseract_ocr_model.filename})', # noqa + desc=f'{model.title} ({model.filename})', unit="B", unit_scale=True, unit_divisor=1024, total=int(r.headers['Content-Length']) ) pbar.clear() - with open(tesseract_ocr_model.path, 'wb') as f: + with open(model.path, 'wb') as f: for chunk in r.iter_content(chunk_size=1024): if chunk: # filter out keep-alive new chunks pbar.update(len(chunk)) @@ -560,11 +587,7 @@ class Job(HashidMixin, db.Model): description = db.Column(db.String(255)) end_date = db.Column(db.DateTime()) service = db.Column(db.String(64)) - ''' - ' Dictionary as JSON formatted string. 
- ' Example: {"binarization": True} - ''' - service_args = db.Column(db.String(255)) + service_args = db.Column(ContainerColumn(dict, 255)) service_version = db.Column(db.String(16)) status = db.Column( IntEnumColumn(JobStatus), @@ -643,10 +666,6 @@ class Job(HashidMixin, db.Model): self.status = JobStatus.SUBMITTED def to_dict(self, backrefs=False, relationships=False): - service_args = json.loads(self.service_args) - if self.service == 'tesseract-ocr' and 'model' in service_args: - tesseract_ocr_pipeline_model = TesseractOCRModel.query.get(service_args['model']) # noqa - service_args['model'] = tesseract_ocr_pipeline_model.title dict_job = { 'id': self.hashid, 'user_id': self.user.hashid, @@ -654,7 +673,7 @@ class Job(HashidMixin, db.Model): 'description': self.description, 'end_date': None if self.end_date is None else f'{self.end_date.isoformat()}Z', # noqa 'service': self.service, - 'service_args': service_args, + 'service_args': self.service_args, 'service_version': self.service_version, 'status': self.status.name, 'title': self.title, @@ -798,7 +817,6 @@ class Corpus(HashidMixin, db.Model): title = db.Column(db.String(32)) num_analysis_sessions = db.Column(db.Integer, default=0) num_tokens = db.Column(db.Integer, default=0) - archive_file = db.Column(db.String(255)) # Backrefs: user: User # Relationships files = db.relationship( diff --git a/app/services/forms.py b/app/services/forms.py index 4a16ad4a799ed89a18088ca009050b019f65f0e9..21db025e5de83b65d651a41a5c4c3f887b35c754 100644 --- a/app/services/forms.py +++ b/app/services/forms.py @@ -1,5 +1,7 @@ -from app.models import TesseractOCRModel +from app.models import Job, TesseractOCRModel +from flask_login import current_user from flask_wtf import FlaskForm +from flask_wtf.file import FileField, FileAllowed, FileRequired from wtforms import ( BooleanField, MultipleFileField, @@ -8,110 +10,143 @@ from wtforms import ( SubmitField, ValidationError ) -from wtforms.validators import DataRequired, Length +from 
wtforms.validators import DataRequired, InputRequired, Length from . import SERVICES class AddJobForm(FlaskForm): - description = StringField('Description', validators=[DataRequired(), Length(1, 255)]) # noqa + description = StringField('Description', validators=[InputRequired()]) # noqa submit = SubmitField() - title = StringField('Title', validators=[DataRequired(), Length(1, 32)]) + title = StringField('Title', validators=[InputRequired()]) version = SelectField('Version', validators=[DataRequired()]) + def validate_description(self, field): + max_length = Job.description.property.columns[0].type.length + if len(field.data) > max_length: + raise ValidationError( + f'Description must be less than {max_length} characters' + ) -class AddSpacyNLPJobForm(AddJobForm): - encoding_detection = BooleanField('Encoding detection') - files = MultipleFileField('Files', validators=[DataRequired()]) - model = SelectField( - 'Model', - choices=[('', 'Choose your option')], - default='', - validators=[DataRequired()] - ) + def validate_title(self, field): + max_length = Job.title.property.columns[0].type.length + if len(field.data) > max_length: + raise ValidationError( + f'Title must be less than {max_length} characters' + ) - def validate_encoding_detection(self, field): - service_info = SERVICES['spacy-nlp']['versions'][self.version.data] - if field.data and 'encoding_detection' not in service_info['methods']: - raise ValidationError('Encoding detection is not available') - def validate_files(form, field): - valid_extensions = ['.txt'] - for file in field.data: - if not file.filename.lower().endswith(tuple(valid_extensions)): - raise ValidationError( - 'File does not have an approved extension: ' - '/'.join(valid_extensions) - ) +class AddFileSetupPipelineJobForm(AddJobForm): + images = MultipleFileField('File(s)', validators=[DataRequired()]) + + def validate_images(form, field): + valid_mimetypes = ['image/jpeg', 'image/png', 'image/tiff'] + for image in field.data: + if 
image.mimetype not in valid_mimetypes: + raise ValidationError('JPEG, PNG and TIFF files only!') def __init__(self, *args, **kwargs): - version = kwargs.pop('version', SERVICES['spacy-nlp']['latest_version']) # noqa + service_manifest = SERVICES['file-setup-pipeline'] + version = kwargs.pop('version', service_manifest['latest_version']) super().__init__(*args, **kwargs) - service_info = SERVICES['spacy-nlp']['versions'][version] - if 'encoding_detection' not in service_info['methods']: - self.encoding_detection.render_kw = {'disabled': True} - self.model.choices += [(x, y) for x, y in service_info['models'].items()] # noqa - self.version.choices = [(x, x) for x in SERVICES['spacy-nlp']['versions']] # noqa - self.version.default = version + self.version.choices = [(x, x) for x in service_manifest['versions']] + self.version.data = version + self.version.default = service_manifest['latest_version'] -class AddTesseractOCRJobForm(AddJobForm): +class AddTesseractOCRPipelineJobForm(AddJobForm): binarization = BooleanField('Binarization') - files = MultipleFileField('Files', validators=[DataRequired()]) - model = SelectField( - 'Model', - choices=[('', 'Choose your option')], - default='', - validators=[DataRequired()] - ) + pdf = FileField('File', validators=[FileRequired()]) + model = SelectField('Model', validators=[DataRequired()]) def validate_binarization(self, field): - service_info = SERVICES['tesseract-ocr']['versions'][self.version.data] + service_info = SERVICES['tesseract-ocr-pipeline']['versions'][self.version.data] if field.data and 'binarization' not in service_info['methods']: raise ValidationError('Binarization is not available') - def validate_files(self, field): - valid_extensions = ['.pdf'] - for file in field.data: - if not file.filename.lower().endswith(tuple(valid_extensions)): - raise ValidationError( - 'File does not have an approved extension: ' - '/'.join(valid_extensions) - ) + def validate_pdf(self, field): + if field.data.mimetype != 
'application/pdf': + raise ValidationError('PDF files only!') def __init__(self, *args, **kwargs): - version = kwargs.pop('version', SERVICES['tesseract-ocr']['latest_version']) # noqa + service_manifest = SERVICES['tesseract-ocr-pipeline'] + version = kwargs.pop('version', service_manifest['latest_version']) super().__init__(*args, **kwargs) - service_info = SERVICES['tesseract-ocr']['versions'][version] + service_info = service_manifest['versions'][version] if 'binarization' not in service_info['methods']: self.binarization.render_kw = {'disabled': True} - self.model.choices += [(x.hashid, x.title) for x in TesseractOCRModel.query.all()] # noqa - self.version.choices = [(x, x) for x in SERVICES['tesseract-ocr']['versions']] # noqa + compatible_models = [ + x for x in TesseractOCRModel.query.filter_by(shared=True).all() + if version in x.compatible_service_versions + ] + compatible_models += [ + x for x in TesseractOCRModel.query.filter_by(shared=False, user=current_user).all() + if version in x.compatible_service_versions + ] + self.model.choices = [('', 'Choose your option')] + self.model.choices += [(x.hashid, x.title) for x in compatible_models] + self.model.default = '' + self.version.choices = [(x, x) for x in service_manifest['versions']] self.version.data = version - self.version.default = SERVICES['tesseract-ocr']['latest_version'] + self.version.default = service_manifest['latest_version'] -class AddFileSetupJobForm(AddJobForm): - files = MultipleFileField('Files', validators=[DataRequired()]) +class AddTranskribusHTRPipelineJobForm(AddJobForm): + binarization = BooleanField('Binarization') + pdf = FileField('File', validators=[FileRequired()]) + model = SelectField('Model', validators=[DataRequired()]) - def validate_files(form, field): - valid_extensions = ['.jpeg', '.jpg', '.png', '.tiff', '.tif'] - for file in field.data: - if not file.filename.lower().endswith(tuple(valid_extensions)): - raise ValidationError( - 'File does not have an approved 
extension: ' - '/'.join(valid_extensions) - ) + def validate_binarization(self, field): + service_info = SERVICES['transkribus-htr-pipeline']['versions'][self.version.data] + if field.data and 'binarization' not in service_info['methods']: + raise ValidationError('Binarization is not available') + + def validate_pdf(self, field): + if field.data.mimetype != 'application/pdf': + raise ValidationError('PDF files only!') def __init__(self, *args, **kwargs): - version = kwargs.pop('version', SERVICES['file-setup']['latest_version']) # noqa + service_manifest = SERVICES['transkribus-htr-pipeline'] + version = kwargs.pop('version', service_manifest['latest_version']) super().__init__(*args, **kwargs) - self.version.choices = [(x, x) for x in SERVICES['file-setup']['versions']] # noqa + service_info = service_manifest['versions'][version] + if 'binarization' not in service_info['methods']: + self.binarization.render_kw = {'disabled': True} + self.model.choices = [('', 'Choose your option')] + self.model.choices += [ + ('37569', 'Tim Model'), + ('29539', 'UCL–University of Toronto #7') + ] + self.model.default = '' + self.version.choices = [(x, x) for x in service_manifest['versions']] self.version.data = version - self.version.default = SERVICES['file-setup']['latest_version'] + self.version.default = service_manifest['latest_version'] + + +class AddSpacyNLPPipelineJobForm(AddJobForm): + encoding_detection = BooleanField('Encoding detection') + txt = FileField('File', validators=[FileRequired()]) + model = SelectField('Model', validators=[DataRequired()]) + + def validate_encoding_detection(self, field): + service_manifest = SERVICES['spacy-nlp-pipeline'] + service_info = service_manifest['versions'][self.version.data] + if field.data and 'encoding_detection' not in service_info['methods']: + raise ValidationError('Encoding detection is not available!') + def validate_txt(form, field): + if field.data.mimetype != 'text/plain': + raise ValidationError('Plain text files 
only!') -AddJobForms = { - 'file-setup': AddFileSetupJobForm, - 'tesseract-ocr': AddTesseractOCRJobForm, - 'spacy-nlp': AddSpacyNLPJobForm -} + def __init__(self, *args, **kwargs): + service_manifest = SERVICES['spacy-nlp-pipeline'] + version = kwargs.pop('version', service_manifest['latest_version']) + super().__init__(*args, **kwargs) + service_info = service_manifest['versions'][version] + if 'encoding_detection' not in service_info['methods']: + self.encoding_detection.render_kw = {'disabled': True} + self.model.choices = [('', 'Choose your option')] + self.model.choices += [(x, y) for x, y in service_info['models'].items()] # noqa + self.model.default = '' + self.version.choices = [(x, x) for x in service_manifest['versions']] + self.version.data = version + self.version.default = version diff --git a/app/services/routes.py b/app/services/routes.py index 7a8e520ebcff52d1f270753b351160c11e91a0a9..feecf39ad6c91097d7b5eae9d230339253f6e522 100644 --- a/app/services/routes.py +++ b/app/services/routes.py @@ -13,47 +13,33 @@ from flask_login import current_user, login_required from werkzeug.utils import secure_filename from . import bp from . 
import SERVICES -from .forms import AddJobForms +from .forms import ( + AddFileSetupPipelineJobForm, + AddTesseractOCRPipelineJobForm, + AddTranskribusHTRPipelineJobForm, + AddSpacyNLPPipelineJobForm +) import json -@bp.route('/corpus-analysis') -@login_required -def corpus_analysis(): - return render_template( - 'services/corpus_analysis.html.j2', - title='Corpus analysis' - ) - - -@bp.route('/<service>', methods=['GET', 'POST']) +@bp.route('/file-setup-pipeline', methods=['GET', 'POST']) @login_required -def service(service): - # Check if the requested service exist - if service not in SERVICES or service not in AddJobForms: - abort(404) - version = request.args.get('version', SERVICES[service]['latest_version']) - if version not in SERVICES[service]['versions']: +def file_setup_pipeline(): + service = 'file-setup-pipeline' + service_manifest = SERVICES[service] + version = request.args.get('version', service_manifest['latest_version']) + if version not in service_manifest['versions']: abort(404) - form = AddJobForms[service](prefix='add-job-form', version=version) - title = SERVICES[service]['name'] + form = AddFileSetupPipelineJobForm(prefix='add-job-form', version=version) if form.is_submitted(): if not form.validate(): return make_response(form.errors, 400) service_args = {} - if service == 'spacy-nlp': - service_args['model'] = form.model.data - if form.encoding_detection.data: - service_args['encoding_detection'] = True - if service == 'tesseract-ocr': - service_args['model'] = hashids.decode(form.model.data) - if form.binarization.data: - service_args['binarization'] = True job = Job( user=current_user, description=form.description.data, service=service, - service_args=json.dumps(service_args), + service_args=service_args, service_version=form.version.data, title=form.title.data ) @@ -67,18 +53,17 @@ def service(service): db.session.rollback() flash('Internal Server Error', 'error') return make_response({'redirect_url': url_for('.service', 
service=service)}, 500) # noqa - for file in form.files.data: - filename = secure_filename(file.filename) + for image_file in form.images.data: job_input = JobInput( - filename=filename, + filename=secure_filename(image_file.filename), job=job, - mimetype=file.mimetype + mimetype=image_file.mimetype ) db.session.add(job_input) db.session.flush(objects=[job_input]) db.session.refresh(job_input) try: - file.save(job_input.path) + image_file.save(job_input.path) except OSError as e: current_app.logger.error(e) db.session.rollback() @@ -91,5 +76,196 @@ def service(service): return render_template( f'services/{service.replace("-", "_")}.html.j2', form=form, - title=title + title=service_manifest['name'] + ) + + +@bp.route('/tesseract-ocr-pipeline', methods=['GET', 'POST']) +@login_required +def tesseract_ocr_pipeline(): + service = 'tesseract-ocr-pipeline' + service_manifest = SERVICES[service] + version = request.args.get('version', service_manifest['latest_version']) + if version not in service_manifest['versions']: + abort(404) + form = AddTesseractOCRPipelineJobForm(prefix='add-job-form', version=version) + if form.is_submitted(): + if not form.validate(): + return make_response(form.errors, 400) + service_args = {} + service_args['model'] = hashids.decode(form.model.data) + if form.binarization.data: + service_args['binarization'] = True + job = Job( + user=current_user, + description=form.description.data, + service=service, + service_args=service_args, + service_version=form.version.data, + title=form.title.data + ) + db.session.add(job) + db.session.flush(objects=[job]) + db.session.refresh(job) + try: + job.makedirs() + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + flash('Internal Server Error', 'error') + return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa + job_input = JobInput( + filename=secure_filename(form.pdf.data.filename), + job=job, + mimetype=form.pdf.data.mimetype + ) + 
db.session.add(job_input) + db.session.flush(objects=[job_input]) + db.session.refresh(job_input) + try: + form.pdf.data.save(job_input.path) + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + flash('Internal Server Error', 'error') + return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa + job.status = JobStatus.SUBMITTED + db.session.commit() + flash(f'Job "{job.title}" added', 'job') + return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201) # noqa + return render_template( + f'services/{service.replace("-", "_")}.html.j2', + form=form, + title=service_manifest['name'] + ) + + +@bp.route('/transkribus-htr-pipeline', methods=['GET', 'POST']) +@login_required +def transkribus_htr_pipeline(): + if not current_app.config.get('NOPAQUE_TRANSKRIBUS_ENABLED'): + abort(404) + service = 'transkribus-htr-pipeline' + service_manifest = SERVICES[service] + version = request.args.get('version', service_manifest['latest_version']) + if version not in service_manifest['versions']: + abort(404) + form = AddTranskribusHTRPipelineJobForm(prefix='add-job-form', version=version) + if form.is_submitted(): + if not form.validate(): + return make_response(form.errors, 400) + service_args = {} + service_args['model'] = form.model.data + if form.binarization.data: + service_args['binarization'] = True + job = Job( + user=current_user, + description=form.description.data, + service=service, + service_args=service_args, + service_version=form.version.data, + title=form.title.data + ) + db.session.add(job) + db.session.flush(objects=[job]) + db.session.refresh(job) + try: + job.makedirs() + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + flash('Internal Server Error', 'error') + return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa + job_input = JobInput( + filename=secure_filename(form.pdf.data.filename), + job=job, + 
mimetype=form.pdf.data.mimetype + ) + db.session.add(job_input) + db.session.flush(objects=[job_input]) + db.session.refresh(job_input) + try: + form.pdf.data.save(job_input.path) + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + flash('Internal Server Error', 'error') + return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa + job.status = JobStatus.SUBMITTED + db.session.commit() + flash(f'Job "{job.title}" added', 'job') + return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201) # noqa + return render_template( + f'services/{service.replace("-", "_")}.html.j2', + form=form, + title=service_manifest['name'] ) + + +@bp.route('/spacy-nlp-pipeline', methods=['GET', 'POST']) +@login_required +def spacy_nlp_pipeline(): + service = 'spacy-nlp-pipeline' + service_manifest = SERVICES[service] + version = request.args.get('version', SERVICES[service]['latest_version']) + if version not in service_manifest['versions']: + abort(404) + form = AddSpacyNLPPipelineJobForm(prefix='add-job-form', version=version) + if form.is_submitted(): + if not form.validate(): + return make_response(form.errors, 400) + service_args = {} + service_args['model'] = form.model.data + if form.encoding_detection.data: + service_args['encoding_detection'] = True + job = Job( + user=current_user, + description=form.description.data, + service=service, + service_args=service_args, + service_version=form.version.data, + title=form.title.data + ) + db.session.add(job) + db.session.flush(objects=[job]) + db.session.refresh(job) + try: + job.makedirs() + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + flash('Internal Server Error', 'error') + return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa + job_input = JobInput( + filename=secure_filename(form.txt.data.filename), + job=job, + mimetype=form.txt.data.mimetype + ) + db.session.add(job_input) + 
db.session.flush(objects=[job_input]) + db.session.refresh(job_input) + try: + form.txt.data.save(job_input.path) + except OSError as e: + current_app.logger.error(e) + db.session.rollback() + flash('Internal Server Error', 'error') + return make_response({'redirect_url': url_for('.service', service=service)}, 500) # noqa + job.status = JobStatus.SUBMITTED + db.session.commit() + flash(f'Job "{job.title}" added', 'job') + return make_response({'redirect_url': url_for('jobs.job', job_id=job.id)}, 201) # noqa + return render_template( + f'services/{service.replace("-", "_")}.html.j2', + form=form, + title=service_manifest['name'] + ) + + +@bp.route('/corpus-analysis') +@login_required +def corpus_analysis(): + return render_template( + 'services/corpus_analysis.html.j2', + title='Corpus analysis' + ) \ No newline at end of file diff --git a/app/services/services.yml b/app/services/services.yml index 0c82c3d91c37bc0ae292fba24188ab1462972a96..c26f7fb7025898dbe982f1e9de0f759714c6cfde 100644 --- a/app/services/services.yml +++ b/app/services/services.yml @@ -1,38 +1,70 @@ # TODO: This could also be done via GitLab/GitHub APIs -#file-setup-pipeline: -file-setup: +file-setup-pipeline: name: 'File setup pipeline' + publisher: 'Bielefeld University - CRC 1288 - INF' latest_version: '0.1.0' versions: 0.1.0: - publisher: 'Bielefeld University - CRC 1288 - INF' publishing_year: 2022 - url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup/-/releases/v0.1.0' -#spacy-nlp-pipeline: -spacy-nlp: - name: 'spaCy NLP' + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup-pipeline/-/releases/v0.1.0' +tesseract-ocr-pipeline: + name: 'Tesseract OCR Pipeline' + publisher: 'Bielefeld University - CRC 1288 - INF' + latest_version: '0.1.4' + versions: + 0.1.0: + methods: + - 'binarization' + publishing_year: 2022 + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.0' + 0.1.1: + methods: + - 'binarization' + publishing_year: 2022 + 
url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.1' + 0.1.2: + methods: + - 'binarization' + publishing_year: 2022 + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.2' + 0.1.3: + methods: + - 'binarization' + publishing_year: 2022 + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.3' + 0.1.4: + methods: + - 'binarization' + publishing_year: 2022 + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline/-/releases/v0.1.4' +transkribus-htr-pipeline: + name: 'Transkribus HTR Pipeline' + publisher: 'Bielefeld University - CRC 1288 - INF' + latest_version: '0.1.0' + versions: + 0.1.0: + methods: + - 'binarization' + publishing_year: 2022 + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/transkribus-htr-pipeline/-/releases/v0.1.0' +spacy-nlp-pipeline: + name: 'spaCy NLP Pipeline' + publisher: 'Bielefeld University - CRC 1288 - INF' latest_version: '0.1.0' versions: 0.1.0: methods: - 'encoding_detection' models: + ca: 'Catalan' de: 'German' + el: 'Greek' en: 'English' + es: 'Spanish' + fr: 'French' it: 'Italian' pl: 'Polish' + ru: 'Russian' zh: 'Chinese' - publisher: 'Bielefeld University - CRC 1288 - INF' - publishing_year: 2022 - url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp/-/releases/v0.1.0' -#tesseract-ocr-pipeline: -tesseract-ocr: - name: 'Tesseract OCR' - latest_version: '0.1.0' - versions: - 0.1.0: - methods: - - 'binarization' - publisher: 'Bielefeld University - CRC 1288 - INF' publishing_year: 2022 - url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr/-/releases/v0.1.0' + url: 'https://gitlab.ub.uni-bielefeld.de/sfb1288inf/spacy-nlp-pipeline/-/releases/v0.1.0' \ No newline at end of file diff --git a/app/static/css/colors.scss b/app/static/css/colors.scss index 230efe4ba585f6e64e5939e2fbdaec1ad65d9a3d..0dfe27d54f8b18fb529dfd631233bec930abb88a 100644 --- a/app/static/css/colors.scss +++ 
b/app/static/css/colors.scss @@ -28,20 +28,25 @@ $color: ( "darken": #6b3f89, "lighten": #ebe8f6 ), - "file-setup": ( + "file-setup-pipeline": ( "base": #d5dc95, "darken": #a1b300, "lighten": #f2f3e1 ), - "spacy-nlp": ( + "spacy-nlp-pipeline": ( "base": #98acd2, "darken": #0064a3, "lighten": #e5e8f5 ), - "tesseract-ocr": ( + "tesseract-ocr-pipeline": ( "base": #a9d8c8, "darken": #00a58b, "lighten": #e7f4f1 + ), + "transkribus-htr-pipeline": ( + "base": #607d8b, + "darken": #37474f, + "lighten": #cfd8dc ) ), "status": ( diff --git a/app/static/css/style.css b/app/static/css/style.css index 928307258197f7f7c80bc271099188c500ad6e68..0a6c575a867b58f4fa54c235fb9a7aa998a4510f 100644 --- a/app/static/css/style.css +++ b/app/static/css/style.css @@ -43,9 +43,10 @@ h1 .nopaque-icons, h2 .nopaque-icons, h3 .nopaque-icons, h4 .nopaque-icons, .tab .job-status-text {text-transform: lowercase;} .job-status-text[data-job-status]:empty:before {content: attr(data-job-status);} -.nopaque-icons.service-icon[data-service="file-setup"]:empty:before {content: "E";} -.nopaque-icons.service-icon[data-service="tesseract-ocr"]:empty:before {content: "F";} -.nopaque-icons.service-icon[data-service="spacy-nlp"]:empty:before {content: "G";} +.nopaque-icons.service-icon[data-service="file-setup-pipeline"]:empty:before {content: "E";} +.nopaque-icons.service-icon[data-service="tesseract-ocr-pipeline"]:empty:before {content: "F";} +.nopaque-icons.service-icon[data-service="transkribus-htr-pipeline"]:empty:before {content: "F";} +.nopaque-icons.service-icon[data-service="spacy-nlp-pipeline"]:empty:before {content: "G";} .nopaque-icons.service-icon[data-service="corpus-analysis"]:empty:before {content: "H";} .hoverable {cursor: pointer;} diff --git a/app/templates/_roadmap.html.j2 b/app/templates/_roadmap.html.j2 index e8265ba0a2e354d8a69d7dbcf7b76874e9bc35b1..3e87d6079d413dbc80598e9bc575685b37b209ad 100644 --- a/app/templates/_roadmap.html.j2 +++ b/app/templates/_roadmap.html.j2 @@ -3,11 +3,13 @@ 
<h2>Roadmap</h2> <p>The roadmap guides you through nopaque's workflow! If you have the necessary input file formats, you can directly jump into the corresponding process. If not, you can use the roadmap to jump right to the preceding process.</p> <ul class="tabs tabs-fixed-width"> - <li class="tab"><a{%if request.path == url_for('services.service', service='file-setup') %} class="active"{% endif %} href="{{ url_for('services.service', service='file-setup') }}" target="_self">File setup</a></li> + <li class="tab"><a{%if request.path == url_for('services.file_setup_pipeline') %} class="active"{% endif %} href="{{ url_for('services.file_setup_pipeline') }}" target="_self">File setup</a></li> <li class="tab disabled"><i class="material-icons">navigate_next</i></li> - <li class="tab"><a{%if request.path == url_for('services.service', service='tesseract-ocr') %} class="active"{% endif %} href="{{ url_for('services.service', service='tesseract-ocr') }}" target="_self">OCR</a></li> + <li class="tab"><a{%if request.path == url_for('services.tesseract_ocr_pipeline') %} class="active"{% endif %} href="{{ url_for('services.tesseract_ocr_pipeline') }}" target="_self">OCR</a></li> + <li class="tab disabled"><i class="material-icons">more_vert</i></li> + <li class="tab"><a{%if request.path == url_for('services.transkribus_htr_pipeline') %} class="active"{% endif %} href="{{ url_for('services.transkribus_htr_pipeline') }}" target="_self">HTR</a></li> <li class="tab disabled"><i class="material-icons">navigate_next</i></li> - <li class="tab"><a{%if request.path == url_for('services.service', service='spacy-nlp') %} class="active"{% endif %} href="{{ url_for('services.service', service='spacy-nlp') }}" target="_self">NLP</a></li> + <li class="tab"><a{%if request.path == url_for('services.spacy_nlp_pipeline') %} class="active"{% endif %} href="{{ url_for('services.spacy_nlp_pipeline') }}" target="_self">NLP</a></li> <li class="tab disabled"><i
class="material-icons">navigate_next</i></li> <li class="tab"><a{%if request.path == url_for('corpora.add_corpus') %} class="active"{% endif %} href="{{ url_for('corpora.add_corpus') }}" target="_self">Add corpus</a></li> <li class="tab disabled"><i class="material-icons">navigate_next</i></li> diff --git a/app/templates/_sidenav.html.j2 b/app/templates/_sidenav.html.j2 index 8729f4f8a81d0dcb52b14a775c5405a521301922..c246b6bde255b2a9e485fd8159c2eda6bba4aac8 100644 --- a/app/templates/_sidenav.html.j2 +++ b/app/templates/_sidenav.html.j2 @@ -14,10 +14,13 @@ <li><a href="{{ url_for('main.dashboard', _anchor='jobs') }}" style="padding-left: 47px;"><i class="nopaque-icons">J</i>My Jobs</a></li> <li><div class="divider"></div></li> <li><a class="subheader">Processes & Services</a></li> - <li class="service-color service-color-border border-darken" data-service="file-setup" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='file-setup') }}"><i class="nopaque-icons service-icon" data-service="file-setup"></i>File setup</a></li> - <li class="service-color service-color-border border-darken" data-service="tesseract-ocr" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='tesseract-ocr') }}"><i class="nopaque-icons service-icon" data-service="tesseract-ocr"></i>OCR</a></li> - <li class="service-color service-color-border border-darken" data-service="spacy-nlp" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='spacy-nlp') }}"><i class="nopaque-icons service-icon" data-service="spacy-nlp"></i>NLP</a></li> - <li class="service-color service-color-border border-darken" data-service="corpus-analysis" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.service', service='corpus-analysis') }}"><i class="nopaque-icons service-icon" data-service="corpus-analysis"></i>Corpus analysis</a></li> + <li 
class="service-color service-color-border border-darken" data-service="file-setup-pipeline" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.file_setup_pipeline') }}"><i class="nopaque-icons service-icon" data-service="file-setup-pipeline"></i>File setup</a></li> + <li class="service-color service-color-border border-darken" data-service="tesseract-ocr-pipeline" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.tesseract_ocr_pipeline') }}"><i class="nopaque-icons service-icon" data-service="tesseract-ocr-pipeline"></i>OCR</a></li> + {% if config.NOPAQUE_TRANSKRIBUS_ENABLED %} + <li class="service-color service-color-border border-darken" data-service="transkribus-htr-pipeline" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.transkribus_htr_pipeline') }}"><i class="nopaque-icons service-icon" data-service="transkribus-htr-pipeline"></i>HTR</a></li> + {% endif %} + <li class="service-color service-color-border border-darken" data-service="spacy-nlp-pipeline" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.spacy_nlp_pipeline') }}"><i class="nopaque-icons service-icon" data-service="spacy-nlp-pipeline"></i>NLP</a></li> + <li class="service-color service-color-border border-darken" data-service="corpus-analysis" style="border-left: 10px solid; margin-top: 5px;"><a href="{{ url_for('services.corpus_analysis') }}"><i class="nopaque-icons service-icon" data-service="corpus-analysis"></i>Corpus analysis</a></li> <li><div class="divider"></div></li> <li><a class="subheader">Account</a></li> <li><a href="{{ url_for('settings.index') }}"><i class="material-icons">settings</i>Settings</a></li> diff --git a/app/templates/main/dashboard.html.j2 b/app/templates/main/dashboard.html.j2 index 7bac5c03db8adb04cbcbfd5e28dd24147d6d9197..00695d69cefbb9c8fe3eac63e12676aaaec0f1c0 100644 --- a/app/templates/main/dashboard.html.j2 +++ 
b/app/templates/main/dashboard.html.j2 @@ -115,37 +115,37 @@ <div class="col s12 m4"> <div class="card-panel center-align hoverable"> <br> - <a href="{{ url_for('services.service', service='file-setup') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);"> - <i class="nopaque-icons service-color darken service-icon" data-service="file-setup"></i> + <a href="{{ url_for('services.file_setup_pipeline') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);"> + <i class="nopaque-icons service-color darken service-icon" data-service="file-setup-pipeline"></i> </a> <br><br> - <p class="service-color-text darken" data-service="file-setup"><b>File setup</b></p> + <p class="service-color-text darken" data-service="file-setup-pipeline"><b>File setup</b></p> <p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. nopaque converts and merges those files to facilitate further processing.</p> - <a href="{{ url_for('services.service', service='file-setup') }}" class="waves-effect waves-light btn service-color darken" data-service="file-setup">Create Job</a> + <a href="{{ url_for('services.file_setup_pipeline') }}" class="waves-effect waves-light btn service-color darken" data-service="file-setup-pipeline">Create Job</a> </div> </div> <div class="col s12 m4"> <div class="card-panel center-align hoverable"> <br> - <a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);"> - <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr" style="font-size: 2.5rem;"></i> + <a href="{{ url_for('services.tesseract_ocr_pipeline') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);"> + <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr-pipeline" style="font-size: 
2.5rem;"></i> </a> <br><br> - <p class="service-color-text darken" data-service="tesseract-ocr"><b>Optical Character Recognition</b></p> + <p class="service-color-text darken" data-service="tesseract-ocr-pipeline"><b>Optical Character Recognition</b></p> <p class="light">nopaque converts your image data – like photos or scans – into text data through a process called OCR. This step enables you to proceed with further computational analysis of your documents.</p> - <a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="waves-effect waves-light btn service-color darken" data-service="tesseract-ocr">Create Job</a> + <a href="{{ url_for('services.tesseract_ocr_pipeline') }}" class="waves-effect waves-light btn service-color darken" data-service="tesseract-ocr-pipeline">Create Job</a> </div> </div> <div class="col s12 m4"> <div class="card-panel center-align hoverable"> <br> - <a href="{{ url_for('services.service', service='spacy-nlp') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);"> - <i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp" style="font-size: 2.5rem;"></i> + <a href="{{ url_for('services.spacy_nlp_pipeline') }}" class="btn-floating btn-large waves-effect waves-light" style="transform: scale(2);"> + <i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp-pipeline" style="font-size: 2.5rem;"></i> </a> <br><br> - <p class="service-color-text darken" data-service="spacy-nlp"><b>Natural Language Processing</b></p> + <p class="service-color-text darken" data-service="spacy-nlp-pipeline"><b>Natural Language Processing</b></p> <p class="light">By means of computational linguistic data processing (tokenization, lemmatization, part-of-speech tagging and named-entity recognition) nopaque extracts additional information from your text.</p> - <a href="{{ url_for('services.service', service='spacy-nlp') }}" class="waves-effect waves-light btn 
service-color darken" data-service="spacy-nlp">Create Job</a> + <a href="{{ url_for('services.spacy_nlp_pipeline') }}" class="waves-effect waves-light btn service-color darken" data-service="spacy-nlp-pipeline">Create Job</a> </div> </div> </div> diff --git a/app/templates/main/faq.html.j2 b/app/templates/main/faq.html.j2 index 86dde843b49ba0e43698b056d0865533cc776ed0..29ac92714cfc83f26db47f5d3bd83436a95a6a4e 100644 --- a/app/templates/main/faq.html.j2 +++ b/app/templates/main/faq.html.j2 @@ -35,9 +35,9 @@ <p>Our source code is spread over multiple Git repositories.</p> <ul> <li>nopaque frontend: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nopaque</a></li> - <li>File setup: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup</a></li> - <li>OCR: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/ocr</a></li> - <li>NLP: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/nlp</a></li> + <li>File Setup Pipeline: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup-pipeline" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/file-setup-pipeline</a></li> + <li>Tesseract OCR Pipeline: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/tesseract-ocr-pipeline</a></li> + <li>spaCy NLP Pipeline: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/spacy-nlp-pipeline" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/spacy-nlp-pipeline</a></li> <li>Corpus analysis backend: <a href="https://gitlab.ub.uni-bielefeld.de/sfb1288inf/cqpserver" target="_blank">https://gitlab.ub.uni-bielefeld.de/sfb1288inf/cqpserver</a></li> <li>Corpus analysis backend 
connector: <a href="https://github.com/Pevtrick/cqi-py" target="_blank">https://github.com/Pevtrick/cqi-py</a></li> </ul> diff --git a/app/templates/main/index.html.j2 b/app/templates/main/index.html.j2 index 80ebd02fb80d813e2a84fcdf6b132516407fe06c..6cc285d0a2c254247a3da38670f9513609c2c30d 100644 --- a/app/templates/main/index.html.j2 +++ b/app/templates/main/index.html.j2 @@ -76,31 +76,31 @@ <p class="hide-on-small-only"> </p> <div class="row"> <div class="col s12 m6 l3 center-align"> - <a href="{{ url_for('services.service', service='file-setup') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> - <i class="nopaque-icons service-color darken service-icon" data-service="file-setup"></i> + <a href="{{ url_for('services.file_setup_pipeline') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> + <i class="nopaque-icons service-color darken service-icon" data-service="file-setup-pipeline"></i> </a> <br><br> - <p class="service-color-text text-darken" data-service="file-setup"><b>File setup</b></p> + <p class="service-color-text text-darken" data-service="file-setup-pipeline"><b>File setup</b></p> <p class="light">Digital copies of text based research data (books, letters, etc.) often comprise various files and formats. 
nopaque converts and merges those files to facilitate further processing and the application of other services.</p> </div> <div class="col s12 m6 l3 center-align"> - <a href="{{ url_for('services.service', service='tesseract-ocr') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> - <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr"></i> + <a href="{{ url_for('services.tesseract_ocr_pipeline') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> + <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr-pipeline"></i> </a> <br><br> - <p class="service-color-text text-darken" data-service="tesseract-ocr"><b>Optical Character Recognition</b></p> + <p class="service-color-text text-darken" data-service="tesseract-ocr-pipeline"><b>Optical Character Recognition</b></p> <p class="light">nopaque converts your image data – like photos or scans – into text data through OCR making it machine readable. 
This step enables you to proceed with further computational analysis of your documents.</p> </div> <div class="col s12 m6 l3 center-align"> - <a href="{{ url_for('services.service', service='spacy-nlp') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> - <i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp"></i> + <a href="{{ url_for('services.spacy_nlp_pipeline') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> + <i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp-pipeline"></i> </a> <br><br> - <p class="service-color-text text-darken" data-service="spacy-nlp"><b>Natural Language Processing</b></p> + <p class="service-color-text text-darken" data-service="spacy-nlp-pipeline"><b>Natural Language Processing</b></p> <p class="light">By means of computational linguistic data processing (tokenization, lemmatization, part-of-speech tagging and named-entity recognition) nopaque extracts additional information from your text.</p> </div> <div class="col s12 m6 l3 center-align"> - <a href="{{ url_for('services.service', service='corpus_analysis') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> + <a href="{{ url_for('services.corpus_analysis') }}" class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> <i class="nopaque-icons service-color darken service-icon" data-service="corpus-analysis"></i> </a> <br><br> diff --git a/app/templates/services/_breadcrumbs.html.j2 b/app/templates/services/_breadcrumbs.html.j2 index d819fe60a41942bab297bd425ea8ee557046a348..a08beafab6595a87910f44350e8a569205a7a79c 100644 --- a/app/templates/services/_breadcrumbs.html.j2 +++ b/app/templates/services/_breadcrumbs.html.j2 @@ -2,13 +2,15 @@ <li class="tab disabled"><i class="material-icons">navigate_next</i></li> <li class="tab"><a href="{{ url_for('main.index', _anchor='services') }}" target="_self">Processes & Services</a></li> <li class="tab 
disabled"><i class="material-icons">navigate_next</i></li> -{% if request.path == url_for('.service', service='corpus-analysis') %} -<li class="tab"><a class="active" href="{{ url_for('.service', service='corpus-analysis') }}" target="_self">{{ title }}</a></li> -{% elif request.path == url_for('.service', service='file-setup') %} -<li class="tab"><a class="active" href="{{ url_for('.service', service='file-setup') }}" target="_self">{{ title }}</a></li> -{% elif request.path == url_for('.service', service='nlp') %} -<li class="tab"><a class="active" href="{{ url_for('.service', service='nlp') }}" target="_self">{{ title }}</a></li> -{% elif request.path == url_for('.service', service='ocr') %} -<li class="tab"><a class="active" href="{{ url_for('.service', service='ocr') }}" target="_self">{{ title }}</a></li> +{% if request.path == url_for('.corpus_analysis') %} +<li class="tab"><a class="active" href="{{ url_for('.corpus_analysis') }}" target="_self">{{ title }}</a></li> +{% elif request.path == url_for('.file_setup_pipeline') %} +<li class="tab"><a class="active" href="{{ url_for('.file_setup_pipeline') }}" target="_self">{{ title }}</a></li> +{% elif request.path == url_for('.spacy_nlp_pipeline') %} +<li class="tab"><a class="active" href="{{ url_for('.spacy_nlp_pipeline') }}" target="_self">{{ title }}</a></li> +{% elif request.path == url_for('.tesseract_ocr_pipeline') %} +<li class="tab"><a class="active" href="{{ url_for('.tesseract_ocr_pipeline') }}" target="_self">{{ title }}</a></li> +{% elif request.path == url_for('.transkribus_htr_pipeline') %} +<li class="tab"><a class="active" href="{{ url_for('.transkribus_htr_pipeline') }}" target="_self">{{ title }}</a></li> {% endif %} {% endset %} diff --git a/app/templates/services/file_setup.html.j2 b/app/templates/services/file_setup_pipeline.html.j2 similarity index 89% rename from app/templates/services/file_setup.html.j2 rename to app/templates/services/file_setup_pipeline.html.j2 index 
9e70288c4f030e5bf1b1d566c7cb577d4f16ff9d..ce3e21e8fa005991aa190505abab2ee7fc58fa5d 100644 --- a/app/templates/services/file_setup.html.j2 +++ b/app/templates/services/file_setup_pipeline.html.j2 @@ -2,7 +2,7 @@ {% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %} {% import "materialize/wtf.html.j2" as wtf %} -{% block main_attribs %} class="service-scheme" data-service="file-setup"{% endblock main_attribs %} +{% block main_attribs %} class="service-scheme" data-service="file-setup-pipeline"{% endblock main_attribs %} {% block page_content %} <div class="container"> @@ -16,13 +16,13 @@ <p class="hide-on-small-only"> </p> <p class="hide-on-small-only"> </p> <a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> - <i class="nopaque-icons service-color darken service-icon" data-service="file-setup"></i> + <i class="nopaque-icons service-color darken service-icon" data-service="file-setup-pipeline"></i> </a> </div> </div> <div class="col s12 m9 pull-m3"> - <div class="card service-color-border border-darken" data-service="file-setup" style="border-top: 10px solid;"> + <div class="card service-color-border border-darken" data-service="file-setup-pipeline" style="border-top: 10px solid;"> <div class="card-content"> <div class="row"> <div class="col s12"> @@ -50,7 +50,7 @@ {{ wtf.render_field(form.description, data_length='255', material_icon='description') }} </div> <div class="col s12 l9"> - {{ wtf.render_field(form.files, accept='image/jpeg, image/png, image/tiff', placeholder='Choose your .jpeg, .png or .tiff files') }} + {{ wtf.render_field(form.images, accept='image/jpeg, image/png, image/tiff', placeholder='Choose JPEG, PNG or TIFF files') }} </div> <div class="col s12 l3"> {{ wtf.render_field(form.version, material_icon='apps') }} diff --git a/app/templates/services/spacy_nlp.html.j2 b/app/templates/services/spacy_nlp_pipeline.html.j2 similarity index 95% rename from app/templates/services/spacy_nlp.html.j2 rename to 
app/templates/services/spacy_nlp_pipeline.html.j2 index 30fab84cff34a8a42d8d9bd78e208d4fcc6cca8c..5c911cd9163538848606aa3cce89f401ecbbfe7b 100644 --- a/app/templates/services/spacy_nlp.html.j2 +++ b/app/templates/services/spacy_nlp_pipeline.html.j2 @@ -2,7 +2,7 @@ {% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %} {% import "materialize/wtf.html.j2" as wtf %} -{% block main_attribs %} class="service-scheme" data-service="spacy-nlp"{% endblock main_attribs %} +{% block main_attribs %} class="service-scheme" data-service="spacy-nlp-pipeline"{% endblock main_attribs %} {% block page_content %} <div class="container"> @@ -16,13 +16,13 @@ <p class="hide-on-small-only"> </p> <p class="hide-on-small-only"> </p> <a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> - <i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp"></i> + <i class="nopaque-icons service-color darken service-icon" data-service="spacy-nlp-pipeline"></i> </a> </div> </div> <div class="col s12 m9 pull-m3"> - <div class="card service-color-border border-darken" data-service="spacy-nlp" style="border-top: 10px solid;"> + <div class="card service-color-border border-darken" data-service="spacy-nlp-pipeline" style="border-top: 10px solid;"> <div class="card-content"> <div class="row"> <div class="col s12 m6"> @@ -68,7 +68,7 @@ {{ wtf.render_field(form.description, data_length='255', material_icon='description') }} </div> <div class="col s12 l5"> - {{ wtf.render_field(form.files, accept='text/plain', placeholder='Choose your .txt files') }} + {{ wtf.render_field(form.txt, accept='text/plain', placeholder='Choose a plain text file') }} </div> <div class="col s12 l4"> {{ wtf.render_field(form.model, material_icon='language') }} diff --git a/app/templates/services/tesseract_ocr.html.j2 b/app/templates/services/tesseract_ocr_pipeline.html.j2 similarity index 95% rename from app/templates/services/tesseract_ocr.html.j2 rename to 
app/templates/services/tesseract_ocr_pipeline.html.j2 index 6612128105838ffb8dd9ddb7a8e4b248f9a5a121..c14a8c71ab19ccb899a01c7965df5dad43a9c7ca 100644 --- a/app/templates/services/tesseract_ocr.html.j2 +++ b/app/templates/services/tesseract_ocr_pipeline.html.j2 @@ -2,7 +2,7 @@ {% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %} {% import "materialize/wtf.html.j2" as wtf %} -{% block main_attribs %} class="service-scheme" data-service="tesseract-ocr"{% endblock main_attribs %} +{% block main_attribs %} class="service-scheme" data-service="tesseract-ocr-pipeline"{% endblock main_attribs %} {% block page_content %} <div class="container"> @@ -16,13 +16,13 @@ <p class="hide-on-small-only"> </p> <p class="hide-on-small-only"> </p> <a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> - <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr"></i> + <i class="nopaque-icons service-color darken service-icon" data-service="tesseract-ocr-pipeline"></i> </a> </div> </div> <div class="col s12 m9 pull-m3"> - <div class="card service-color-border border-darken" data-service="tesseract-ocr" style="border-top: 10px solid;"> + <div class="card service-color-border border-darken" data-service="tesseract-ocr-pipeline" style="border-top: 10px solid;"> <div class="card-content"> <div class="row"> <div class="col s12"> @@ -50,7 +50,7 @@ {{ wtf.render_field(form.description, data_length='255', material_icon='description') }} </div> <div class="col s12 l5"> - {{ wtf.render_field(form.files, accept='application/pdf', placeholder='Choose your .pdf files') }} + {{ wtf.render_field(form.pdf, accept='application/pdf', placeholder='Choose a PDF file') }} </div> <div class="col s12 l4"> {{ wtf.render_field(form.model, material_icon='language') }} diff --git a/app/templates/services/transkribus_htr_pipeline.html.j2 b/app/templates/services/transkribus_htr_pipeline.html.j2 new file mode 100644 index 
0000000000000000000000000000000000000000..f91d0bf555f8cd7a055fb25faa3b907687f9b56c --- /dev/null +++ b/app/templates/services/transkribus_htr_pipeline.html.j2 @@ -0,0 +1,169 @@ +{% extends "base.html.j2" %} +{% from "services/_breadcrumbs.html.j2" import breadcrumbs with context %} +{% import "materialize/wtf.html.j2" as wtf %} + +{% block main_attribs %} class="service-scheme" data-service="transkribus-htr-pipeline"{% endblock main_attribs %} + +{% block page_content %} +<div class="container"> + <div class="row"> + <div class="col s12"> + <h1 id="title">{{ title }}</h1> + </div> + + <div class="col s12 m3 push-m9"> + <div class="center-align"> + <p class="hide-on-small-only"> </p> + <p class="hide-on-small-only"> </p> + <a class="btn-floating btn-large btn-scale-x2 waves-effect waves-light"> + <i class="nopaque-icons service-color darken service-icon" data-service="transkribus-htr-pipeline"></i> + </a> + </div> + </div> + + <div class="col s12 m9 pull-m3"> + <div class="card service-color-border border-darken" data-service="transkribus-htr-pipeline" style="border-top: 10px solid;"> + <div class="card-content"> + <div class="row"> + <div class="col s12"> + <div class="card-panel z-depth-0"> + <span class="card-title"><i class="left material-icons">layers</i>HTR</span> + <p>In this process, nopaque converts your image data – like photos or scans – into text data. 
This step enables you to proceed with the computational analysis of your documents.</p> + <p class="right-align"> + <a href="https://readcoop.eu/de/transkribus/" target="_blank"> + <img src="https://readcoop.eu/wp-content/uploads/2020/02/Logo_Transkribus_web.svg" title="Logo_Transkribus_web" alt="Logo_Transkribus_web" style="width: 30%;"> + </a> + </p> + </div> + </div> + </div> + </div> + </div> + </div> + + <div class="col s12"> + <h2>Submit a job</h2> + <div class="card"> + <form class="nopaque-upload-form" data-progress-modal="progress-modal"> + <div class="card-content"> + {{ form.hidden_tag() }} + <div class="row"> + <div class="col s12 l4"> + {{ wtf.render_field(form.title, data_length='32', material_icon='title') }} + </div> + <div class="col s12 l8"> + {{ wtf.render_field(form.description, data_length='255', material_icon='description') }} + </div> + <div class="col s12 l5"> + {{ wtf.render_field(form.pdf, accept='application/pdf', placeholder='Choose a PDF file') }} + </div> + <div class="col s12 l4"> + {{ wtf.render_field(form.model, material_icon='language') }} + </div> + <div class="col s12 l3"> + {{ wtf.render_field(form.version, material_icon='apps') }} + </div> + <div class="col s12"> + <span class="card-title">Preprocessing</span> + </div> + <div class="col s9"> + <p>{{ form.binarization.label.text }}</p> + <p class="light">Based on a brightness threshold pixels are converted into either black or white. It is useful to reduce noise in images.
(<b>longer duration</b>)</p> + </div> + <div class="col s3 right-align"> + <div class="switch"> + <label> + {{ form.binarization() }} + <span class="lever"></span> + </label> + </div> + </div> + <div class="col s12"><p> </p></div> + <div class="col s12 divider"></div> + <div class="col s12"><p> </p></div> + <div class="col s9"> + <p>Page range</p> + <p class="light"></p> + </div> + <div class="col s3 right-align"> + <div class="switch"> + <label> + <input disabled type="checkbox"> + <span class="lever"></span> + </label> + </div> + </div> + <div class="col s12"><p> </p></div> + <div class="col s12 divider"></div> + <div class="col s12"><p> </p></div> + <div class="col s9"> + <p>Page rotation</p> + <p class="light"></p> + </div> + <div class="col s3 right-align"> + <div class="switch"> + <label> + <input disabled type="checkbox"> + <span class="lever"></span> + </label> + </div> + </div> + <div class="col s12"><p> </p></div> + <div class="col s12 divider"></div> + <div class="col s12"><p> </p></div> + <div class="col s9"> + <p>Page split</p> + <p class="light"></p> + </div> + <div class="col s3 right-align"> + <div class="switch"> + <label> + <input disabled type="checkbox"> + <span class="lever"></span> + </label> + </div> + </div> + <!-- + Seperate each setting with the following + <div class="col s12"><p> </p></div> + <div class="col s12 divider"></div> + <div class="col s12"><p> </p></div> + --> + </div> + </div> + <div class="card-action right-align"> + {{ wtf.render_field(form.submit, material_icon='send') }} + </div> + </form> + </div> + </div> + </div> +</div> +{% endblock page_content %} + +{% block modals %} +{{ super() }} +<div id="progress-modal" class="modal"> + <div class="modal-content"> + <h4><i class="material-icons left">file_upload</i>Uploading files...</h4> + <div class="progress"> + <div class="determinate" style="width: 0%"></div> + </div> + </div> + <div class="modal-footer"> + <a href="#!" 
class="modal-close waves-effect waves-light btn red abort-request">Cancel</a> + </div> +</div> +{% endblock modals %} + +{% block scripts %} +{{ super() }} +<script> + let versionField = document.querySelector('#add-job-form-version'); + versionField.addEventListener('change', (event) => { + let url = new URL(window.location.href); + url.search = `?version=${event.target.value}`; + window.location.href = url.toString(); + }); +</script> +{% endblock scripts %} diff --git a/config.py b/config.py index c746595a014afe13b447b3fc0426bffcc5faff94..07ec1d78f0ca4866f3e58a49837a77d82e26b395 100644 --- a/config.py +++ b/config.py @@ -92,6 +92,12 @@ class Config: NOPAQUE_PROXY_FIX_X_PROTO = \ int(os.environ.get('NOPAQUE_PROXY_FIX_X_PROTO', '0')) + # Opt-in: stays disabled unless set to "true" (case-insensitive); .env.tpl documents DEFAULT: False + NOPAQUE_TRANSKRIBUS_ENABLED = \ + os.environ.get('NOPAQUE_TRANSKRIBUS_ENABLED', 'false').lower() == 'true' + NOPAQUE_READCOOP_USERNAME = os.environ.get('NOPAQUE_READCOOP_USERNAME') + NOPAQUE_READCOOP_PASSWORD = os.environ.get('NOPAQUE_READCOOP_PASSWORD') + @classmethod def init_app(cls, app: Flask): # Set up logging according to the corresponding (NOPAQUE_LOG_*) diff --git a/docker-compose.traefik.yml b/docker-compose.traefik.yml index a96b3b1acc7a15dc12421398caf8cffb2eb0bec3..d0f7f4d338e9cb0ba2825d60b0438ac03010e8ea 100644 --- a/docker-compose.traefik.yml +++ b/docker-compose.traefik.yml @@ -18,13 +18,15 @@ services: - "traefik.http.middlewares.http-nopaque-headers.headers.customrequestheaders.X-Forwarded-Proto=http" - "traefik.http.routers.http-nopaque.entrypoints=http" - "traefik.http.routers.http-nopaque.middlewares=http-nopaque-headers, redirect-to-https@file" - - "traefik.http.routers.http-nopaque.rule=Host(`${SERVER_NAME}`)" + # Replace <nopaque-domain> with your domain + - "traefik.http.routers.http-nopaque.rule=Host(`<nopaque-domain>`)" ### </http> ### ### <https> ### - "traefik.http.middlewares.https-nopaque-headers.headers.customrequestheaders.X-Forwarded-Proto=https" - 
"traefik.http.routers.https-nopaque.entrypoints=https" - "traefik.http.routers.https-nopaque.middlewares=hsts-header@file, https-nopaque-headers" - - "traefik.http.routers.https-nopaque.rule=Host(`${SERVER_NAME}`)" + # Replace <nopaque-domain> with your domain + - "traefik.http.routers.https-nopaque.rule=Host(`<nopaque-domain>`)" - "traefik.http.routers.https-nopaque.tls.certresolver=<CERTRESOLVER>" - "traefik.http.routers.https-nopaque.tls.options=intermediate@file" ### </https> ### diff --git a/migrations/versions/097aae1f02d7_.py b/migrations/versions/aa855b80cf1d_.py similarity index 97% rename from migrations/versions/097aae1f02d7_.py rename to migrations/versions/aa855b80cf1d_.py index ccac6756c2f6a8ef1f90f1fac0e279bf8b44ab66..687c89a8ddc9214931d07e9986bebe333f6250e9 100644 --- a/migrations/versions/097aae1f02d7_.py +++ b/migrations/versions/aa855b80cf1d_.py @@ -1,8 +1,8 @@ """empty message -Revision ID: 097aae1f02d7 +Revision ID: aa855b80cf1d Revises: -Create Date: 2022-02-08 10:02:03.748588 +Create Date: 2022-04-01 12:14:42.606685 """ from alembic import op @@ -10,7 +10,7 @@ import sqlalchemy as sa # revision identifiers, used by Alembic. 
-revision = '097aae1f02d7' +revision = 'aa855b80cf1d' down_revision = None branch_labels = None depends_on = None @@ -56,7 +56,6 @@ def upgrade(): sa.Column('title', sa.String(length=32), nullable=True), sa.Column('num_analysis_sessions', sa.Integer(), nullable=True), sa.Column('num_tokens', sa.Integer(), nullable=True), - sa.Column('archive_file', sa.String(length=255), nullable=True), sa.ForeignKeyConstraint(['user_id'], ['users.id'], ), sa.PrimaryKeyConstraint('id') ) @@ -85,6 +84,7 @@ def upgrade(): sa.Column('description', sa.String(length=255), nullable=True), sa.Column('publisher', sa.String(length=128), nullable=True), sa.Column('publishing_year', sa.Integer(), nullable=True), + sa.Column('shared', sa.Boolean(), nullable=True), sa.Column('title', sa.String(length=64), nullable=True), sa.Column('version', sa.String(length=16), nullable=True), sa.ForeignKeyConstraint(['user_id'], ['users.id'], ), diff --git a/requirements.txt b/requirements.txt index ce5a4bfbf0a7c482bb2309b62902021761af55f0..038611ce6cdec97ae30c1728dee75b88b0f7d96d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ cqi docker eventlet==0.30.2 Flask==1.1.4 +Flask-APScheduler Flask-Assets Flask-Hashids Flask-HTTPAuth @@ -16,6 +17,7 @@ Flask-WTF gunicorn hiredis jsonschema +MarkupSafe==2.0.1 psycopg2 pyScss python-dotenv