diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..837d45d --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +.local/ + +leantime/ + + +.vscode/ + +.DS_Store \ No newline at end of file diff --git a/Readme.md b/Readme.md new file mode 100644 index 0000000..aeac49c --- /dev/null +++ b/Readme.md @@ -0,0 +1,35 @@ + ██████╗ ███████╗███╗ ██╗██╗██╗ ██╗ ██████╗ ██████╗ ███████╗ ██████╗ + ██╔════╝ ██╔════╝████╗ ██║██║██║ ██║██╔════╝ ██╔════╝ ██╔════╝██╔═══██╗ + ██║ ███╗█████╗ ██╔██╗ ██║██║██║ ██║╚█████╗ ██║ █████╗ ██║ ██║ + ██║ ██║██╔══╝ ██║╚██╗██║██║██║ ██║ ╚═══██╗ ██║ ██╔══╝ ██║ ██║ + ╚██████╔╝███████╗██║ ╚████║██║╚█████╔╝██████╔╝ ╚██████╗ ███████╗╚██████╔╝ + ╚═════╝ ╚══════╝╚═╝ ╚═══╝╚═╝ ╚════╝ ╚═════╝ ██ ╚═════╝ ╚══════╝ ╚═════╝ +--- + +# Genius.ceo repository + +Ceph Dashboard is now available at: + + URL: https://manager-node-1:8443/ + User: admin + Password: g0uhtgv520 + +Enabling client.admin keyring and conf on hosts with "admin" label +Saving cluster configuration to /var/lib/ceph/6fb9d55b-5b20-11f0-9be6-920006043bcc/config directory +You can access the Ceph CLI as following in case of multi-cluster or non-default config: + + sudo /usr/sbin/cephadm shell --fsid 6fb9d55b-5b20-11f0-9be6-920006043bcc -c /etc/ceph/ceph.conf -k /etc/ceph/ceph.client.admin.keyring + +Or, if you are only running a single cluster on this host: + + sudo /usr/sbin/cephadm shell + +Please consider enabling telemetry to help improve Ceph: + + ceph telemetry on + +For more information see: + + https://docs.ceph.com/en/latest/mgr/telemetry/ + +Bootstrap complete. \ No newline at end of file diff --git a/iac/ansible/01-infra.playbook.yml b/iac/ansible/01-infra.playbook.yml new file mode 100644 index 0000000..5927c3c --- /dev/null +++ b/iac/ansible/01-infra.playbook.yml @@ -0,0 +1,33 @@ +- name: Nodes initialisieren und härten + hosts: all + become: true + + roles: + - role: common + tags: common + - role: ssh_hardening + tags: ssh + - role: ufw_firewall + tags: firewall + - role: fail2ban + tags: fail2ban + handlers: + - name: restart sshd + ansible.builtin.service: + name: ssh + state: restarted + - name: restart fail2ban + ansible.builtin.service: + name: fail2ban + state: restarted +- name: Setup Ceph Cluster and CephFS + hosts: all + become: true + roles: + - role: ceph_setup + +- name: Docker Swarm initialisieren + hosts: all + become: true + roles: + - role: docker_swarm \ No newline at end of file diff --git a/iac/ansible/02-plattform.playbook.yml b/iac/ansible/02-plattform.playbook.yml new file mode 100644 index 0000000..73bf203 --- /dev/null +++ b/iac/ansible/02-plattform.playbook.yml @@ -0,0 +1,9 @@ +- name: Infrastruktur Dienste bereitstellen + hosts: all + gather_facts: true + roles: + - traefik + - authentik + - portainer + - leantime + - kestra \ No newline at end of file diff --git a/iac/ansible/assets/20auto-upgrades b/iac/ansible/assets/20auto-upgrades new file mode 100644 index 0000000..7587020 --- /dev/null +++ b/iac/ansible/assets/20auto-upgrades @@ -0,0 +1,4 @@ +APT::Periodic::Update-Package-Lists "1"; +APT::Periodic::Download-Upgradeable-Packages "1"; +APT::Periodic::AutocleanInterval "7"; +APT::Periodic::Unattended-Upgrade "1"; \ No newline at end of file diff --git a/iac/ansible/assets/50unattended-upgrades b/iac/ansible/assets/50unattended-upgrades new file mode 100644 index 0000000..76dd90b --- /dev/null +++ b/iac/ansible/assets/50unattended-upgrades @@ -0,0 +1,9 @@ +Unattended-Upgrade::Allowed-Origins { + "${distro_id}:${distro_codename}-security"; +}; 
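+// Only the "-security" origin is listed above, so unattended-upgrades installs security
+// fixes only. To also apply regular stable updates, one could additionally allow
+// "${distro_id}:${distro_codename}-updates"; inside the Allowed-Origins block.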
+Unattended-Upgrade::Package-Blacklist { +}; +Unattended-Upgrade::DevRelease "false"; +Unattended-Upgrade::Remove-Unused-Kernel-Packages "true"; +Unattended-Upgrade::Remove-Unused-Dependencies "true"; +Unattended-Upgrade::Automatic-Reboot "false"; \ No newline at end of file diff --git a/iac/ansible/group_vars/all.yml b/iac/ansible/group_vars/all.yml new file mode 100644 index 0000000..925913e --- /dev/null +++ b/iac/ansible/group_vars/all.yml @@ -0,0 +1,12 @@ +admin_user: 'admin' +ssh_port: 22 +cephfs_name: "shared-fs" +ceph_osd_device: "/dev/sdb" +public_interface: 'eth0' +private_interface: 'enp7s0' +authorized_keys: + - 'ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKtYTptTN9ggoy0aUKXmxaPKpclEn86jM7s5UtTw1JJI' # Marcels MacBook +main_domain: genius.ceo + +ceph_volume: /mnt/cephfs +traefik_public_net: traefik_public \ No newline at end of file diff --git a/iac/ansible/group_vars/secrets.yml b/iac/ansible/group_vars/secrets.yml new file mode 100644 index 0000000..f940b11 --- /dev/null +++ b/iac/ansible/group_vars/secrets.yml @@ -0,0 +1,59 @@ +$ANSIBLE_VAULT;1.1;AES256 +34613362353964313436306439386661613364663666653265313937343239633365663836653030 +6262386661666364383961336461316139333262623034340a643434316632336132613264646437 +36376365613061353866383135353432303433353931633063313566613166303064316666613132 +3733623536643935370a656431646435626265666265666230356162656363663838636662313466 +61336237306332643032653766313036636163336431613236663864636438363832383231323362 +62666463336639303766356331353635323031636465616235663738333761653934346663386636 +63623361363164663663313966653939643462353638613464396466613931363662623763326535 +34376237353663656636363866373466346434666339646131396439653261373738636665613435 +65356330303863303236373933333163633964633061393136646632386137346434353365343763 +30343937656166303962653030366566616331666262343336343138623566353832313836643435 +62636333346235316562303061656166383135633464623734626336623565346336626134333933 +35363363376663333061663164623539363731613263376163306436636265336562396439356137 +30663431373131303437393166396539306133636264653733303762316363386438643536306338 +32303139303363316264393939326561393730396664343361393863303736343933636265633439 +65633765666362396439643863653531363366383866373939616333353430633530343262366138 +31663863663165653932653733623761613265383039336633383832393761666337336165613933 +63383934366662353038626539633132313939376231643133363739303235326433353733363437 +35626233613936626532326262646166363739666162353237323237383132333134343439336134 +33613462393237626432386462373439303439356666336630363536366233346438313039346530 +33393232333633663731393466653439623638316565346530306439326431323436356166633334 +66383034643834613133333265646338303463393035393266653832366434313636633730636436 +38353337633437656262623061666563646637626363353561323231376237623264373861376666 +66363265633638356133353933613664353934373634613662326437336562663766306364303538 +35623130616265623838353838396235386661666132623163383162373665313462663738303933 +63363764653561616162386139646130393439373066666437623236383238396233653165623032 +34316439376331356539626464313462616238623166623761626435303565653233386236656262 +62613935336661623862323833353265366533643830373634663266666332333463303666343366 +39653332346433306566316430656361363230343761613263393230366362363132663565636264 +65313633653464663963373561373532636235353331353237623635613034613337343730656632 +31656165666134333864353730363163623365393030333932393565666235643639303662663532 
+38343734393135643039633664653966313536616533656635373535636434396333313536623536 +39623132326362656166366566373163386363336231633233353639313166333932656133363365 +66666665346331613638656562396463386637356539366539343232353061666531353166396536 +39623762633064323332653831643832303332396431633738396266633935656132323164613161 +61353663383532613763356630373063383161376165333736316466353231656534366636313636 +37616636383163616136643630363535346137636636633432643337393865393063626663333164 +36656537343231386333323637386539386364356266376433616636313239376666353066306363 +39376461323062393935613630656230346131373634363633393035346263663762623063356633 +36646664623230303761373138333164303363373365386266386138653764623030623630333631 +66363866633064656532336137613964653431663436333761666631656339646161636435343065 +37646164653937633962386631373236653064346438323664383933643738656536356562626532 +34663834363230303164626236393938643037363036613965373330636238633661346335336531 +62663461626365386362393061626266303463663735303539383937363965383234666337386165 +30366564363766623162306666656566353662633866396430396633623266383332303339666663 +38313536666336323366616432336161656434646463373963356331326364333038366337386638 +39396535386331663466323334613533383439343437363631363532313362663564353635343735 +37653063383163316366366335663537653134326564643062653065303337303333643961383837 +39393734326562616165313133643766303934336263326433366436623539633233643761616436 +33356234313538343635343630623337343436346638396539316131623861353630333964633839 +33316565326164386337623730623932313363306436316335336238333430626165663232343463 +36653038633632616335393262656638346434386639383131396233643932323931393264613134 +30336134343464373265636234656561653462356435383138323638613039623839373935326462 +32393430616438356332313766353337383035623137363233323664393833303464313162303833 +65383131313335353832343963636639346162353634306430353638393136623734623833306136 +32396130623065326636633235346630336435663261353866323862666231656261333839373162 +35623835663434356438653533623337363531353634663064303035633839656463656238636132 +66316333356633613130323438376530623634336632323365616239373865623334363635396331 +3263616336653336636666386632316564613331323431363935 diff --git a/iac/ansible/inventory.ini b/iac/ansible/inventory.ini new file mode 100644 index 0000000..d50f564 --- /dev/null +++ b/iac/ansible/inventory.ini @@ -0,0 +1,10 @@ +[all:children] +managers +workers + +[managers] +manager-node-1 ansible_host=37.27.215.220 ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=./.local/secure/private_key +manager-node-2 ansible_host=135.181.146.55 ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=./.local/secure/private_key +manager-node-3 ansible_host=65.109.135.85 ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=./.local/secure/private_key + +[workers] \ No newline at end of file diff --git a/iac/ansible/playbook.yml b/iac/ansible/playbook.yml new file mode 100644 index 0000000..5300ade --- /dev/null +++ b/iac/ansible/playbook.yml @@ -0,0 +1,10 @@ +--- +- name: Main-Playbook + hosts: all + gather_facts: true + roles: + # - traefik + # - portainer + # - kestra + - gitea + diff --git a/iac/ansible/resources/dockge/dockge.yml b/iac/ansible/resources/dockge/dockge.yml new file mode 100644 index 0000000..8048892 --- /dev/null +++ b/iac/ansible/resources/dockge/dockge.yml @@ -0,0 +1,33 @@ +networks: + traefik_public: + external: true + +services: + dockge: + image: 
louislam/dockge:1 + environment: + - DOCKGE_STACKS_DIR=/opt/stacks + - DOCKGE_DATA_DIR=/app/data + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - '/mnt/cephfs/dockge/data:/app/data' + - '/mnt/cephfs/dockge/stacks:/opt/stacks' + networks: + - traefik_public + deploy: + mode: replicated + replicas: 1 + placement: + constraints: + - node.role == manager + restart_policy: + condition: on-failure + labels: + - 'traefik.enable=true' + - 'traefik.swarm.network=traefik_public' + # --- Router für Dockge --- + - 'traefik.http.routers.dockge.rule=Host(`dockge.genius.ceo`)' + - 'traefik.http.routers.dockge.entrypoints=https' + - 'traefik.http.routers.dockge.tls.certresolver=main' + # --- Service für Dockge --- + - 'traefik.http.services.dockge.loadbalancer.server.port=5001' diff --git a/iac/ansible/resources/monitoring/base.yml b/iac/ansible/resources/monitoring/base.yml new file mode 100644 index 0000000..17d65a7 --- /dev/null +++ b/iac/ansible/resources/monitoring/base.yml @@ -0,0 +1,22 @@ + +services: + sd_server: + image: socheatsok78/dockerswarm_sd_server:latest + networks: + - sd_network + volumes: + - /var/run/docker.sock:/var/run/docker.sock + +networks: + monitoring: + driver: overlay + attachable: true + ipam: + config: + - subnet: 172.16.201.0/24 + sd_network: + driver: overlay + attachable: true + ipam: + config: + - subnet: 172.16.202.0/24 \ No newline at end of file diff --git a/iac/ansible/resources/monitoring/config/alloy.v3.alloy b/iac/ansible/resources/monitoring/config/alloy.v3.alloy new file mode 100644 index 0000000..6cfd7d9 --- /dev/null +++ b/iac/ansible/resources/monitoring/config/alloy.v3.alloy @@ -0,0 +1,139 @@ +faro.receiver "stage_app_agent_receiver" { + server { + listen_address = "0.0.0.0" + listen_port = 12347 + cors_allowed_origins = ["*"] + // cors_allowed_origins = ["https://avicenna.genius.ceo"] + api_key = "t3stK3y" + max_allowed_payload_size = "10MiB" + + rate_limiting { + rate = 100 + } + } + + sourcemaps {} + + output { + logs = [loki.process.logs_process_client.receiver] + traces = [otelcol.exporter.otlp.tempo.input] + } +} + +loki.process "logs_process_client" { + forward_to = [loki.write.to_loki.receiver] + + stage.logfmt { + mapping = { "kind" = "", "service_name" = "", "app_name" = "", "namespace" = "" } + } + + stage.labels { + values = { "kind" = "kind", "service_name" = "service_name", "app" = "app_name", "namespace" = "namespace" } + } +} + +otelcol.receiver.otlp "otel_collector" { + grpc { + endpoint = "0.0.0.0:4317" + } + http { + endpoint = "0.0.0.0:4318" + cors { + allowed_origins = ["https://avicenna.genius.ceo/"] + } + } + + // Definiert, wohin die empfangenen Daten weitergeleitet werden + output { + metrics = [otelcol.exporter.prometheus.otel_metrics.input] + logs = [otelcol.exporter.loki.otel_logs.input] + traces = [otelcol.exporter.otlp.tempo.input] + } +} + +loki.write "to_loki" { + endpoint { + url = "http://loki:3100/loki/api/v1/push" + } +} + +prometheus.remote_write "to_prometheus" { + endpoint { + url = "http://prometheus:9090/api/v1/write" + } +} + +// Docker-Container auf dem Host entdecken +discovery.docker "logs_integration_docker" { + host = "unix:///var/run/docker.sock" + refresh_interval = "5s" +} +discovery.relabel "logs_integration_docker" { + targets = [] + + rule { + action = "labelmap" + regex = "__meta_docker_container_label_com_docker_swarm_node_id" + replacement = "node_id" + } + + rule { + action = "labelmap" + regex = "__meta_docker_container_label_com_docker_stack_namespace" + replacement = "namespace" + } + + 
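+  // Note: "labelmap" copies the value of any discovered label whose name matches "regex"
+  // into a new label named by "replacement", so e.g. the Swarm service-name meta label
+  // handled below ends up as a plain "service_name" label on the scraped logs.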
rule { + action = "labelmap" + regex = "__meta_docker_container_label_com_docker_swarm_service_name" + replacement = "service_name" + } + + rule { + action = "labelmap" + regex = "__meta_docker_container_name" + replacement = "container_name" + } +} + +loki.source.docker "logs_from_containers" { + host = "unix:///var/run/docker.sock" + targets = discovery.docker.logs_integration_docker.targets // Nutzt die entdeckten Container + + relabel_rules = discovery.relabel.logs_integration_docker.rules + + // Leitet die gesammelten Logs an den definierten Loki-Endpunkt weiter + forward_to = [loki.write.to_loki.receiver] +} + +otelcol.exporter.otlp "tempo" { // Name kann variieren + client { + endpoint = "tempo:4317" // Ziel: Tempo Service auf Port 4317 + tls { + insecure = true // Interne Kommunikation ohne TLS + } + } +} + +otelcol.exporter.prometheus "otel_metrics" { + forward_to = [prometheus.remote_write.to_prometheus.receiver] +} + +otelcol.exporter.loki "otel_logs" { + forward_to = [loki.write.to_loki.receiver] +} + +// Logging für Alloy selbst konfigurieren +logging { + level = "info" + format = "logfmt" +} + +// prometheus.scrape "alloy_self" { +// targets = [ +// prometheus.target_group { +// targets = [{"__address__" = "localhost:12345"}] +// } +// ] +// forward_to = [...] // An Prometheus Remote Write oder lokalen Agent +// } \ No newline at end of file diff --git a/iac/ansible/resources/monitoring/config/loki.v1.yml b/iac/ansible/resources/monitoring/config/loki.v1.yml new file mode 100644 index 0000000..2c57a8f --- /dev/null +++ b/iac/ansible/resources/monitoring/config/loki.v1.yml @@ -0,0 +1,76 @@ +auth_enabled: false # Einfachste Konfiguration ohne Authentifizierung +analytics: + reporting_enabled: false +server: + http_listen_port: 3100 + grpc_listen_port: 9096 # Standard gRPC Port für Loki + +common: + instance_addr: 127.0.0.1 # Adresse, unter der sich die Instanz meldet + path_prefix: /loki # Wo Loki seine Daten speichert (im Volume) + storage: + filesystem: # Lokales Dateisystem für Indizes und Chunks + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 # Keine Replikation bei Einzelinstanz + ring: + kvstore: + store: inmemory # Einfachster Ring-Speicher für Einzelinstanz + +query_range: + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 100 + +schema_config: + configs: + - from: 2020-10-24 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +pattern_ingester: + enabled: true + metric_aggregation: + loki_address: localhost:3100 + +frontend: + encoding: protobuf + +limits_config: + metric_aggregation_enabled: true + reject_old_samples: true + reject_old_samples_max_age: 168h # 7 Tage + ingestion_rate_mb: 15 # Erlaube 15 MiB/Sekunde pro Tenant (Standard war 4) + ingestion_burst_size_mb: 30 # Erlaube kurzfristige Bursts bis 30 MiB (Standard war 6) + # Optional: Maximale Anzahl aktiver Log-Streams pro Tenant (Standard ist 10000) + # max_global_streams_per_user: 10000 + # Optional: Maximale Größe einer Log-Zeile (Standard 256kB) + # max_line_size: 262144 + +# --- Optional: Compactor (Bereinigt alte Daten) --- +# compactor: +# working_directory: /loki/compactor +# shared_store: filesystem +# compaction_interval: 10m +# retention_enabled: true +# retention_delete_delay: 2h +# retention_delete_worker_count: 150 + +# --- Optional: Ruler (für Alerts basierend auf Logs) --- +# ruler: +# alertmanager_url: http://alertmanager:9093 # Pfad zu deinem Alertmanager +# storage: +# type: local +# 
local: +# directory: /loki/rules +# rule_path: /tmp/loki/rules-temp +# ring: +# kvstore: +# store: inmemory +# enable_api: true \ No newline at end of file diff --git a/iac/ansible/resources/monitoring/config/prometheus.v3.yml b/iac/ansible/resources/monitoring/config/prometheus.v3.yml new file mode 100644 index 0000000..d1e2aff --- /dev/null +++ b/iac/ansible/resources/monitoring/config/prometheus.v3.yml @@ -0,0 +1,57 @@ +global: + scrape_interval: 15s # Wie oft Ziele abgefragt werden + evaluation_interval: 15s # Wie oft Regeln ausgewertet werden + +scrape_configs: + - job_name: 'prometheus' + # Prometheus überwacht sich selbst + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'node-exporter' + # Docker Swarm Service Discovery für den Node Exporter + dockerswarm_sd_configs: + - host: unix:///var/run/docker.sock + role: tasks + port: 9100 # Standard-Port vom Node Exporter + relabel_configs: + # Nur Tasks im Zustand 'running' verwenden + - source_labels: [__meta_dockerswarm_task_desired_state] + regex: running + action: keep + # Nur Tasks des 'node-exporter' Services aus diesem Stack auswählen + # Passe den Regex ggf. an, wenn dein Stack anders heißt (hier Annahme: Stack-Name enthält 'monitoring') + - source_labels: [__meta_dockerswarm_service_name] + regex: ^monitoring_node-exporter$ # Regex an Stack-Namen anpassen! + action: keep + # Verwende den Hostnamen des Swarm Nodes als Instance Label + - source_labels: [__meta_dockerswarm_node_hostname] + target_label: instance + # Setze die Zieladresse korrekt auf IP:Port + - source_labels: [__address__] + regex: '(.*):.*' # Extrahiere die IP-Adresse + replacement: '${1}:9100' # Setze den korrekten Port (9100) + target_label: __address__ + + - job_name: 'cadvisor' + dockerswarm_sd_configs: + - host: unix:///var/run/docker.sock + role: tasks + port: 8080 # Standard-Port von cAdvisor + relabel_configs: + # Nur Tasks im Zustand 'running' verwenden + - source_labels: [__meta_dockerswarm_task_desired_state] + regex: running + action: keep + # Nur Tasks des 'cadvisor' Services aus diesem Stack auswählen + # Passe den Regex an deinen Stack-Namen an! + - source_labels: [__meta_dockerswarm_service_name] + regex: .*(monitoring|mon)_cadvisor.* # Regex an Stack-Namen anpassen! 
+ action: keep + # Verwende den Hostnamen des Swarm Nodes als Instance Label + - source_labels: [__meta_dockerswarm_node_hostname] + target_label: instance + # WICHTIG: Setze den Metrik-Pfad, da cAdvisor ihn unter /metrics bereitstellt + - action: replace + target_label: __metrics_path__ + replacement: /metrics \ No newline at end of file diff --git a/iac/ansible/resources/monitoring/config/promtail.v1.yml b/iac/ansible/resources/monitoring/config/promtail.v1.yml new file mode 100644 index 0000000..b9b14a9 --- /dev/null +++ b/iac/ansible/resources/monitoring/config/promtail.v1.yml @@ -0,0 +1,38 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /mnt/promtail/positions.yaml # Pfad im gemounteten Volume + +clients: + - url: http://loki:3100/loki/api/v1/push # Sendet Logs an den Loki-Service + +scrape_configs: + - job_name: docker_containers + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + relabel_configs: + # Extrahiere Container-Name (ohne '/') + - source_labels: ['__meta_docker_container_name'] + regex: '/(.*)' + target_label: 'container_name' + # Behalte den Log-Stream (stdout/stderr) als Label + - source_labels: ['__meta_docker_container_log_stream'] + target_label: 'logstream' + # Extrahiere Service-Name aus Swarm-Label + - source_labels: ['__meta_docker_container_label_com_docker_swarm_service_name'] + target_label: 'service_name' + # Extrahiere Task-Name aus Swarm-Label + - source_labels: ['__meta_docker_container_label_com_docker_swarm_task_name'] + target_label: 'task_name' + # Füge 'instance'-Label mit dem Hostnamen des Tasks hinzu (Annäherung an Node-Namen) + - action: replace + source_labels: ['container_name'] # Braucht ein existierendes Label als Quelle + target_label: 'instance' + replacement: ${HOSTNAME} # Nutzt Swarm HOSTNAME Variable + # Verwerfe Logs von Promtail selbst (Regex ggf. an Stacknamen anpassen) + - source_labels: ['container_name'] + regex: 'monitoring_promtail.*' # Passe 'monitoring' an deinen Stack-Namen an! + action: drop \ No newline at end of file diff --git a/iac/ansible/resources/monitoring/config/tempo.v1.yml b/iac/ansible/resources/monitoring/config/tempo.v1.yml new file mode 100644 index 0000000..35c4a0e --- /dev/null +++ b/iac/ansible/resources/monitoring/config/tempo.v1.yml @@ -0,0 +1,36 @@ +server: + http_listen_port: 3200 # Standard API/UI Port + +distributor: + receivers: # OTLP receiver aktivieren (Tempo kann auch direkt empfangen) + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +# Grundlegende Konfiguration für Datenverarbeitung (meist ok für Start) +ingester: + trace_idle_period: 10s + max_block_bytes: 1048576 # 1MB + max_block_duration: 5m + +compactor: + compaction: + block_retention: 1h # Wie lange Blöcke mindestens aufheben (geringer Wert für Test) + +# WICHTIG: Storage explizit definieren! +storage: + trace: + backend: local # Backend-Typ: lokales Dateisystem + # Write Ahead Log (WAL) configuration. + wal: + path: /tmp/tempo/wal # Directory to store the the WAL locally. + # Local configuration for filesystem storage. + local: + path: /tmp/tempo/blocks # Directory to store the TSDB blocks. + # Pool used for finding trace IDs. + pool: + max_workers: 100 # Worker pool determines the number of parallel requests to the object store backend. + queue_depth: 10000 # Maximum depth for the querier queue jobs. A job is required for each block searched. 
\ No newline at end of file diff --git a/iac/ansible/resources/monitoring/observability.yml b/iac/ansible/resources/monitoring/observability.yml new file mode 100644 index 0000000..ead9344 --- /dev/null +++ b/iac/ansible/resources/monitoring/observability.yml @@ -0,0 +1,226 @@ +configs: + alloy-config-v3: + file: /srv/monitoring/config/alloy.v3.alloy + loki-config-v1: + file: /srv/monitoring/config/loki.v1.yml + prometheus-config-v3: + file: /srv/monitoring/config/prometheus.v3.yml + tempo-config-v1: + file: /srv/monitoring/config/tempo.v1.yml + +volumes: + prometheus-data: + driver: local + grafana-data: + driver: local + loki-data: + driver: local + alloy-data: + driver: local + tempo-data: + driver: local + +networks: + monitoring-net: # Internes Overlay-Netzwerk für die Monitoring-Komponenten + driver: overlay + attachable: true # Erlaubt anderen Containern/Stacks ggf. den Zugriff + traefik_public: # Das externe Netzwerk, auf dem Traefik lauscht + external: true # Wichtig: Dieses Netzwerk wird NICHT von diesem Stack erstellt + +services: + prometheus: + image: prom/prometheus:latest + user: "65534:988" + volumes: + - prometheus-data:/prometheus + - /var/run/docker.sock:/var/run/docker.sock:ro + configs: + - source: prometheus-config-v3 # Versionierte Config + target: /etc/prometheus/prometheus.yml + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + - '--web.enable-lifecycle' + - '--web.enable-remote-write-receiver' + networks: + - monitoring-net + - traefik_public # Nur wenn Traefik direkt auf Prometheus zugreifen soll (optional) + deploy: + mode: replicated + replicas: 1 + placement: + constraints: + - node.role == manager # Optional: An Manager-Nodes binden + labels: + - "traefik.enable=true" + # --- Router für Prometheus UI --- + - "traefik.http.routers.prometheus.rule=Host(`prometheus.genius.ceo`)" + - "traefik.http.routers.prometheus.entrypoints=https" # Entrypoint anpassen, falls anders + - "traefik.http.routers.prometheus.tls.certresolver=main" # CertResolver anpassen! + # --- Service für Prometheus UI --- + - "traefik.http.services.prometheus.loadbalancer.server.port=9090" + # --- Middleware (optional, z.B. für Authentifizierung) --- + # - "traefik.http.routers.prometheus.middlewares=my-auth-middleware" + # --- Netzwerk für Traefik --- + # WICHTIG: Das Netzwerk muss existieren und Traefik muss darauf lauschen. + - "traefik.swarm.network=traefik_public" # Traefik Netzwerkname anpassen! + + loki: + image: grafana/loki:latest + volumes: + - loki-data:/loki + configs: + - source: loki-config-v1 + target: /etc/loki/local-config.yaml + command: "-config.file=/etc/loki/local-config.yaml" + networks: + - monitoring-net + deploy: + mode: replicated + replicas: 1 + placement: + constraints: + - node.role == manager + + tempo: + image: grafana/tempo:latest # Aktuelles Tempo Image + volumes: + - tempo-data:/tmp/tempo # Persistenter Speicher für Traces (Standardpfad) + configs: + - source: tempo-config-v1 + target: /etc/tempo/tempo.yaml + command: [ "-config.file=/etc/tempo/tempo.yaml" ] + user: root + # Tempo lauscht intern auf verschiedenen Ports für verschiedene Protokolle: + # - 4317 (OTLP gRPC - wird von Alloy genutzt) + # - 4318 (OTLP HTTP) + # - 14268 (Jaeger gRPC) + # - 3200 (Tempo HTTP Frontend/API - für Grafana & UI) + # Wir mappen sie vorerst nicht nach außen. 
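+      # Grafana shares the monitoring-net overlay, so it would typically reach Tempo via a
+      # datasource URL of http://tempo:3200 (the HTTP frontend port from tempo.v1.yml);
+      # the OTLP ports stay internal to the stack.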
+ networks: + - monitoring-net + deploy: + mode: replicated + replicas: 1 + placement: + constraints: + - node.role == manager # Optional: An Manager-Nodes binden + + grafana: + image: grafana/grafana:latest + volumes: + - grafana-data:/var/lib/grafana + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin # Besser über Docker Secrets lösen! + # Weitere Grafana env vars nach Bedarf + networks: + - monitoring-net + - traefik_public # Nur wenn Traefik direkt auf Grafana zugreifen soll (optional) + deploy: + mode: replicated + replicas: 1 + placement: + constraints: + - node.role == manager # Optional: An Manager-Nodes binden + labels: + - "traefik.enable=true" + # --- Router für Grafana --- + - "traefik.http.routers.grafana.rule=Host(`grafana.genius.ceo`)" + - "traefik.http.routers.grafana.entrypoints=https" # Entrypoint anpassen, falls anders + - "traefik.http.routers.grafana.tls.certresolver=main" # CertResolver anpassen! + # --- Service für Grafana --- + - "traefik.http.services.grafana.loadbalancer.server.port=3000" + # --- Middleware (optional) --- + # - "traefik.http.routers.grafana.middlewares=my-auth-middleware" + # --- Netzwerk für Traefik --- + - "traefik.swarm.network=traefik_public" # Traefik Netzwerkname anpassen! + + alloy: + image: grafana/alloy:latest # Offizielles Alloy Image + volumes: + - alloy-data:/var/lib/alloy/data # Persistenter Speicher für Alloy (WAL etc.) + - /var/run/docker.sock:/var/run/docker.sock:ro # Für Docker Discovery + configs: + - source: alloy-config-v3 + target: /etc/alloy/config.alloy # S3-Pfad für Alloy Config + environment: + - HOSTNAME=${HOSTNAME} + # Start mit root wegen Docker Socket / Volume Permissions, kann später optimiert werden (Socket Proxy) + # user: root + command: [ + "run", + "--server.http.listen-addr=0.0.0.0:12345", + "/etc/alloy/config.alloy", + ] + networks: + - monitoring-net + - traefik_public + deploy: + mode: global # WICHTIG: Alloy muss auf jedem Node laufen! + labels: # Traefik Labels für Alloy UI + - "traefik.enable=true" + # --- Router für Alloy UI --- + - "traefik.http.routers.alloy-ui.rule=Host(`otlp.genius.ceo`)" + - "traefik.http.routers.alloy-ui.entrypoints=https" + - "traefik.http.routers.alloy-ui.tls.certresolver=main" + - "traefik.http.routers.alloy-ui.service=alloy-ui@swarm" + # --- Service für Alloy UI --- + - "traefik.http.services.alloy-ui.loadbalancer.server.port=12345" # Ziel-Port ist 12345 (Alloy UI Standard) + # # --- Router für OTLP HTTP --- + # - "traefik.http.routers.otlp-http.rule=Host(`alloy.genius.ceo`)" + # - "traefik.http.routers.otlp-http.entrypoints=https" + # - "traefik.http.routers.otlp-http.tls.certresolver=main" + # - "traefik.http.routers.otlp-http.service=otlp-http@swarm" + # # --- Service für OTLP HTTP --- + # - "traefik.http.services.otlp-http.loadbalancer.server.port=4318" # Ziel-Port ist 4318 (OTLP HTTP Standard) + # --- Router für FARO RECEIVER --- + - "traefik.http.routers.faro-receiver.rule=Host(`alloy.genius.ceo`)" + - "traefik.http.routers.faro-receiver.entrypoints=https" + - "traefik.http.routers.faro-receiver.tls.certresolver=main" + - "traefik.http.routers.faro-receiver.service=faro-receiver@swarm" + # --- Service für FARO RECEIVER --- + - "traefik.http.services.faro-receiver.loadbalancer.server.port=12347" # Ziel-Port ist 12347 (FARO RECEIVER Standard) + # # --- Middlewares --- + # - "traefik.http.routers.otlp-http.middlewares=alloy-ratelimit@swarm" + # - "traefik.http.middlewares.alloy-ratelimit.ratelimit.average=100" # z.B. 
100 Anfragen pro Sekunde + # - "traefik.http.middlewares.alloy-ratelimit.ratelimit.burst=50" # kurzfristig 50 mehr erlaubt + # --- Netzwerk für Traefik --- + - "traefik.swarm.network=traefik_public" # Traefik Netzwerkname prüfen/anpassen! + + node-exporter: + image: quay.io/prometheus/node-exporter:latest # Aktuelles Image verwenden + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + command: + - '--path.procfs=/host/proc' + - '--path.sysfs=/host/sys' + - '--path.rootfs=/rootfs' + - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)' + networks: + - monitoring-net # Nur internes Netzwerk nötig + deploy: + mode: global # Läuft auf JEDEM Node im Swarm + + cadvisor: + image: gcr.io/cadvisor/cadvisor:latest # Google's cAdvisor Image + volumes: + # cAdvisor braucht Zugriff auf Host-System-Infos und Docker + - /:/rootfs:ro + - /var/run:/var/run:ro + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + networks: + - monitoring-net # Nur internes Netzwerk nötig + deploy: + mode: global # Läuft auf JEDEM Node im Swarm + resources: # Optional: Limitiert Ressourcen, cAdvisor kann hungrig sein + limits: + memory: 512M + reservations: + memory: 256M \ No newline at end of file diff --git a/iac/ansible/resources/signoz-infra/config/otel-agent-config.v1.yaml b/iac/ansible/resources/signoz-infra/config/otel-agent-config.v1.yaml new file mode 100644 index 0000000..8b187f2 --- /dev/null +++ b/iac/ansible/resources/signoz-infra/config/otel-agent-config.v1.yaml @@ -0,0 +1,102 @@ +receivers: + hostmetrics: + collection_interval: 30s + root_path: /hostfs + scrapers: + cpu: {} + load: {} + memory: {} + disk: {} + filesystem: {} + network: {} + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + prometheus: + config: + global: + scrape_interval: 60s + scrape_configs: + - job_name: otel-agent + static_configs: + - targets: + - localhost:8888 + labels: + job_name: otel-agent + tcplog/docker: + listen_address: "0.0.0.0:2255" + operators: + - type: regex_parser + regex: '^<([0-9]+)>[0-9]+ (?P[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+)?([zZ]|([\+-])([01]\d|2[0-3]):?([0-5]\d)?)?) (?P\S+) (?P\S+) [0-9]+ - -( (?P.*))?' + timestamp: + parse_from: attributes.timestamp + layout: '%Y-%m-%dT%H:%M:%S.%LZ' + - type: move + from: attributes["body"] + to: body + - type: remove + field: attributes.timestamp + # please remove names from below if you want to collect logs from them + - type: filter + id: signoz_logs_filter + expr: 'attributes.container_name matches "^(signoz_(logspout|signoz|otel-collector|clickhouse|zookeeper))|(infra_(logspout|otel-agent|otel-metrics)).*"' +processors: + batch: + send_batch_size: 10000 + send_batch_max_size: 11000 + timeout: 10s + resourcedetection: + # Using OTEL_RESOURCE_ATTRIBUTES envvar, env detector adds custom labels. 
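+    # Illustrative example (values not part of this stack): starting the agent with
+    #   OTEL_RESOURCE_ATTRIBUTES=deployment.environment=prod,team=platform
+    # makes the env detector attach those key/value pairs to every exported resource.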
+ detectors: + # - ec2 + # - gcp + # - azure + - env + - system + timeout: 2s +extensions: + health_check: + endpoint: 0.0.0.0:13133 + pprof: + endpoint: 0.0.0.0:1777 +exporters: + otlp: + endpoint: ${env:SIGNOZ_COLLECTOR_ENDPOINT} + tls: + insecure: true + headers: + signoz-access-token: ${env:SIGNOZ_ACCESS_TOKEN} + # debug: {} +service: + telemetry: + logs: + encoding: json + metrics: + address: 0.0.0.0:8888 + extensions: + - health_check + - pprof + pipelines: + traces: + receivers: [otlp] + processors: [resourcedetection, batch] + exporters: [otlp] + metrics: + receivers: [otlp] + processors: [resourcedetection, batch] + exporters: [otlp] + metrics/hostmetrics: + receivers: [hostmetrics] + processors: [resourcedetection, batch] + exporters: [otlp] + metrics/prometheus: + receivers: [prometheus] + processors: [resourcedetection, batch] + exporters: [otlp] + logs: + receivers: [otlp, tcplog/docker] + processors: [resourcedetection, batch] + exporters: [otlp] diff --git a/iac/ansible/resources/signoz-infra/config/otel-metrics-config.v1.yaml b/iac/ansible/resources/signoz-infra/config/otel-metrics-config.v1.yaml new file mode 100644 index 0000000..e44bbc6 --- /dev/null +++ b/iac/ansible/resources/signoz-infra/config/otel-metrics-config.v1.yaml @@ -0,0 +1,103 @@ +receivers: + prometheus: + config: + global: + scrape_interval: 60s + scrape_configs: + - job_name: otel-metrics + static_configs: + - targets: + - localhost:8888 + labels: + job_name: otel-metrics + # For Docker daemon metrics to be scraped, it must be configured to expose + # Prometheus metrics, as documented here: https://docs.docker.com/config/daemon/prometheus/ + # - job_name: docker-daemon + # dockerswarm_sd_configs: + # - host: unix:///var/run/docker.sock + # role: nodes + # relabel_configs: + # - source_labels: [__meta_dockerswarm_node_address] + # target_label: __address__ + # replacement: $1:9323 + - job_name: "dockerswarm" + dockerswarm_sd_configs: + - host: unix:///var/run/docker.sock + role: tasks + relabel_configs: + - action: keep + regex: running + source_labels: + - __meta_dockerswarm_task_desired_state + - action: keep + regex: true + source_labels: + - __meta_dockerswarm_service_label_signoz_io_scrape + - regex: ([^:]+)(?::\d+)? + replacement: $1 + source_labels: + - __address__ + target_label: swarm_container_ip + - separator: . 
+ source_labels: + - __meta_dockerswarm_service_name + - __meta_dockerswarm_task_slot + - __meta_dockerswarm_task_id + target_label: swarm_container_name + - target_label: __address__ + source_labels: + - swarm_container_ip + - __meta_dockerswarm_service_label_signoz_io_port + separator: ":" + - source_labels: + - __meta_dockerswarm_service_label_signoz_io_path + target_label: __metrics_path__ + - source_labels: + - __meta_dockerswarm_service_label_com_docker_stack_namespace + target_label: namespace + - source_labels: + - __meta_dockerswarm_service_name + target_label: service_name + - source_labels: + - __meta_dockerswarm_task_id + target_label: service_instance_id + - source_labels: + - __meta_dockerswarm_node_hostname + target_label: host_name +processors: + batch: + send_batch_size: 10000 + send_batch_max_size: 11000 + timeout: 10s + resourcedetection: + detectors: + - env + - system + timeout: 2s +extensions: + health_check: + endpoint: 0.0.0.0:13133 + pprof: + endpoint: 0.0.0.0:1777 +exporters: + otlp: + endpoint: ${env:SIGNOZ_COLLECTOR_ENDPOINT} + tls: + insecure: true + headers: + signoz-access-token: ${env:SIGNOZ_ACCESS_TOKEN} + # debug: {} +service: + telemetry: + logs: + encoding: json + metrics: + address: 0.0.0.0:8888 + extensions: + - health_check + - pprof + pipelines: + metrics: + receivers: [prometheus] + processors: [resourcedetection, batch] + exporters: [otlp] diff --git a/iac/ansible/resources/signoz-infra/signoz-infra.yml b/iac/ansible/resources/signoz-infra/signoz-infra.yml new file mode 100644 index 0000000..62494a8 --- /dev/null +++ b/iac/ansible/resources/signoz-infra/signoz-infra.yml @@ -0,0 +1,78 @@ +version: "3" +x-common: &common + networks: + - signoz-net + extra_hosts: + - host.docker.internal:host-gateway + logging: + options: + max-size: 50m + max-file: "3" + deploy: + mode: global + restart_policy: + condition: on-failure +services: + otel-agent: + <<: *common + image: otel/opentelemetry-collector-contrib:0.111.0 + command: + - --config=/etc/otel-collector-config.yaml + configs: + - source: otel-agent-config-v1 + target: /etc/otel-collector-config.yaml + volumes: + - /:/hostfs:ro + environment: + - SIGNOZ_COLLECTOR_ENDPOINT=http://host.docker.internal:4317 # In case of external SigNoz or cloud, update the endpoint and access token + - OTEL_RESOURCE_ATTRIBUTES=host.name={{.Node.Hostname}},os.type={{.Node.Platform.OS}} + # - SIGNOZ_ACCESS_TOKEN="" + # Before exposing the ports, make sure the ports are not used by other services + # ports: + # - "4317:4317" + # - "4318:4318" + otel-metrics: + <<: *common + image: otel/opentelemetry-collector-contrib:0.111.0 + user: 0:0 # If you have security concerns, you can replace this with your `UID:GID` that has necessary permissions to docker.sock + command: + - --config=/etc/otel-collector-config.yaml + configs: + - source: otel-metrics-config-v1 + target: /etc/otel-collector-config.yaml + volumes: + - /var/run/docker.sock:/var/run/docker.sock + environment: + - SIGNOZ_COLLECTOR_ENDPOINT=http://host.docker.internal:4317 # In case of external SigNoz or cloud, update the endpoint and access token + - OTEL_RESOURCE_ATTRIBUTES=host.name={{.Node.Hostname}},os.type={{.Node.Platform.OS}} + # - SIGNOZ_ACCESS_TOKEN="" + # Before exposing the ports, make sure the ports are not used by other services + # ports: + # - "4317:4317" + # - "4318:4318" + deploy: + mode: replicated + replicas: 1 + placement: + constraints: + - node.role == manager + logspout: + <<: *common + image: "gliderlabs/logspout:v3.2.14" + command: 
syslog+tcp://otel-agent:2255 + user: root + volumes: + - /etc/hostname:/etc/host_hostname:ro + - /var/run/docker.sock:/var/run/docker.sock + depends_on: + - otel-agent + +networks: + signoz-net: + name: signoz-net + external: true +configs: + otel-metrics-config-v1: + file: /mnt/cephfs/signoz-infra/config/otel-metrics-config.v1.yaml + otel-agent-config-v1: + file: /mnt/cephfs/signoz-infra/config/otel-agent-config.v1.yaml \ No newline at end of file diff --git a/iac/ansible/resources/signoz/config/clickhouse/cluster.v1.xml b/iac/ansible/resources/signoz/config/clickhouse/cluster.v1.xml new file mode 100644 index 0000000..8b475ff --- /dev/null +++ b/iac/ansible/resources/signoz/config/clickhouse/cluster.v1.xml @@ -0,0 +1,75 @@ + + + + + + zookeeper-1 + 2181 + + + + + + + + + + + + + + + + clickhouse + 9000 + + + + + + + + diff --git a/iac/ansible/resources/signoz/config/clickhouse/config.v1.xml b/iac/ansible/resources/signoz/config/clickhouse/config.v1.xml new file mode 100644 index 0000000..1965ac3 --- /dev/null +++ b/iac/ansible/resources/signoz/config/clickhouse/config.v1.xml @@ -0,0 +1,1142 @@ + + + + + + information + + json + + /var/log/clickhouse-server/clickhouse-server.log + /var/log/clickhouse-server/clickhouse-server.err.log + + 1000M + 10 + + + + + + + + + + + + + + + + + + 8123 + + + 9000 + + + 9004 + + + 9005 + + + + + + + + + + + + 9009 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 4096 + + + 3 + + + + + false + + + /path/to/ssl_cert_file + /path/to/ssl_key_file + + + false + + + /path/to/ssl_ca_cert_file + + + none + + + 0 + + + -1 + -1 + + + false + + + + + + + + + + + none + true + true + sslv2,sslv3 + true + + + + true + true + sslv2,sslv3 + true + + + + RejectCertificateHandler + + + + + + + + + 100 + + + 0 + + + + 10000 + + + + + + 0.9 + + + 4194304 + + + 0 + + + + + + 8589934592 + + + 5368709120 + + + + 1000 + + + 134217728 + + + 10000 + + + /var/lib/clickhouse/ + + + /var/lib/clickhouse/tmp/ + + + + ` + + + + + + /var/lib/clickhouse/user_files/ + + + + + + + + + + + + + users.xml + + + + /var/lib/clickhouse/access/ + + + + + + + default + + + + + + + + + + + + default + + + + + + + + + true + + + false + + ' | sed -e 's|.*>\(.*\)<.*|\1|') + wget https://github.com/ClickHouse/clickhouse-jdbc-bridge/releases/download/v$PKG_VER/clickhouse-jdbc-bridge_$PKG_VER-1_all.deb + apt install --no-install-recommends -f ./clickhouse-jdbc-bridge_$PKG_VER-1_all.deb + clickhouse-jdbc-bridge & + + * [CentOS/RHEL] + export MVN_URL=https://repo1.maven.org/maven2/ru/yandex/clickhouse/clickhouse-jdbc-bridge + export PKG_VER=$(curl -sL $MVN_URL/maven-metadata.xml | grep '' | sed -e 's|.*>\(.*\)<.*|\1|') + wget https://github.com/ClickHouse/clickhouse-jdbc-bridge/releases/download/v$PKG_VER/clickhouse-jdbc-bridge-$PKG_VER-1.noarch.rpm + yum localinstall -y clickhouse-jdbc-bridge-$PKG_VER-1.noarch.rpm + clickhouse-jdbc-bridge & + + Please refer to https://github.com/ClickHouse/clickhouse-jdbc-bridge#usage for more information. + ]]> + + + + + + + + + + + + + + + 01 + example01-01-1 + + + + + + 3600 + + + + 3600 + + + 60 + + + + + + + + + + /metrics + 9363 + + true + true + true + true + + + + + + system + query_log
+ + toYYYYMM(event_date) + + + + + + 7500 +
+ + + + system + trace_log
+ + toYYYYMM(event_date) + 7500 +
+ + + + system + query_thread_log
+ toYYYYMM(event_date) + 7500 +
+ + + + system + query_views_log
+ toYYYYMM(event_date) + 7500 +
+ + + + system + part_log
+ toYYYYMM(event_date) + 7500 +
+ + + + + + system + metric_log
+ 7500 + 1000 +
+ + + + system + asynchronous_metric_log
+ + 7000 +
+ + + + + + engine MergeTree + partition by toYYYYMM(finish_date) + order by (finish_date, finish_time_us, trace_id) + + system + opentelemetry_span_log
+ 7500 +
+ + + + + system + crash_log
+ + + 1000 +
+ + + + + + + system + processors_profile_log
+ + toYYYYMM(event_date) + 7500 +
+ + + + + + + + + *_dictionary.xml + + + *function.xml + /var/lib/clickhouse/user_scripts/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + /clickhouse/task_queue/ddl + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + click_cost + any + + 0 + 3600 + + + 86400 + 60 + + + + max + + 0 + 60 + + + 3600 + 300 + + + 86400 + 3600 + + + + + + /var/lib/clickhouse/format_schemas/ + + + + + hide encrypt/decrypt arguments + ((?:aes_)?(?:encrypt|decrypt)(?:_mysql)?)\s*\(\s*(?:'(?:\\'|.)+'|.*?)\s*\) + + \1(???) + + + + + + + + + + false + + false + + + https://6f33034cfe684dd7a3ab9875e57b1c8d@o388870.ingest.sentry.io/5226277 + + + + + + + + + + + 268435456 + true + +
diff --git a/iac/ansible/resources/signoz/config/clickhouse/custom-function.v1.xml b/iac/ansible/resources/signoz/config/clickhouse/custom-function.v1.xml new file mode 100644 index 0000000..b2b3f91 --- /dev/null +++ b/iac/ansible/resources/signoz/config/clickhouse/custom-function.v1.xml @@ -0,0 +1,21 @@ + + + executable + histogramQuantile + Float64 + + Array(Float64) + buckets + + + Array(Float64) + counts + + + Float64 + quantile + + CSV + ./histogramQuantile + + diff --git a/iac/ansible/resources/signoz/config/clickhouse/storage.v1.xml b/iac/ansible/resources/signoz/config/clickhouse/storage.v1.xml new file mode 100644 index 0000000..54ec497 --- /dev/null +++ b/iac/ansible/resources/signoz/config/clickhouse/storage.v1.xml @@ -0,0 +1,41 @@ + + + + + + 10485760 + + + s3 + + https://BUCKET-NAME.s3-REGION-NAME.amazonaws.com/data/ + ACCESS-KEY-ID + SECRET-ACCESS-KEY + + + + + + + + + + + default + + + s3 + 0 + + + + + + diff --git a/iac/ansible/resources/signoz/config/clickhouse/users.v1.xml b/iac/ansible/resources/signoz/config/clickhouse/users.v1.xml new file mode 100644 index 0000000..f185620 --- /dev/null +++ b/iac/ansible/resources/signoz/config/clickhouse/users.v1.xml @@ -0,0 +1,123 @@ + + + + + + + + + + 10000000000 + + + random + + + + + 1 + + + + + + + + + + + + + ::/0 + + + + default + + + default + + + + + + + + + + + + + + 3600 + + + 0 + 0 + 0 + 0 + 0 + + + + diff --git a/iac/ansible/resources/signoz/config/otel-collector-config.v4.yaml b/iac/ansible/resources/signoz/config/otel-collector-config.v4.yaml new file mode 100644 index 0000000..399b20b --- /dev/null +++ b/iac/ansible/resources/signoz/config/otel-collector-config.v4.yaml @@ -0,0 +1,140 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + cors: + allowed_origins: + - https://*.genius.ceo + - https://*.avicenna.hamburg + prometheus: + config: + global: + scrape_interval: 60s + scrape_configs: + - job_name: otel-collector + static_configs: + - targets: + - localhost:8888 + labels: + job_name: otel-collector + docker_stats: + endpoint: unix:///var/run/docker.sock + metrics: + container.cpu.utilization: + enabled: true + container.memory.percent: + enabled: true + container.network.io.usage.rx_bytes: + enabled: true + container.network.io.usage.tx_bytes: + enabled: true + container.network.io.usage.rx_dropped: + enabled: true + container.network.io.usage.tx_dropped: + enabled: true + container.memory.usage.limit: + enabled: true + container.memory.usage.total: + enabled: true + container.blockio.io_service_bytes_recursive: + enabled: true +processors: + batch: + send_batch_size: 10000 + send_batch_max_size: 11000 + timeout: 10s + resourcedetection: + # Using OTEL_RESOURCE_ATTRIBUTES envvar, env detector adds custom labels. 
+ detectors: [env, system] + timeout: 2s + resourcedetection/docker: + detectors: [env, docker] + timeout: 2s + override: false + signozspanmetrics/delta: + metrics_exporter: clickhousemetricswrite, signozclickhousemetrics + metrics_flush_interval: 60s + latency_histogram_buckets: [100us, 1ms, 2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1000ms, 1400ms, 2000ms, 5s, 10s, 20s, 40s, 60s ] + dimensions_cache_size: 100000 + aggregation_temporality: AGGREGATION_TEMPORALITY_DELTA + enable_exp_histogram: true + dimensions: + - name: service.namespace + default: default + - name: deployment.environment + default: default + # This is added to ensure the uniqueness of the timeseries + # Otherwise, identical timeseries produced by multiple replicas of + # collectors result in incorrect APM metrics + - name: signoz.collector.id + - name: service.version + - name: browser.platform + - name: browser.mobile + - name: k8s.cluster.name + - name: k8s.node.name + - name: k8s.namespace.name + - name: host.name + - name: host.type + - name: container.name +extensions: + health_check: + endpoint: 0.0.0.0:13133 + pprof: + endpoint: 0.0.0.0:1777 +exporters: + clickhousetraces: + datasource: tcp://clickhouse:9000/signoz_traces + low_cardinal_exception_grouping: ${env:LOW_CARDINAL_EXCEPTION_GROUPING} + use_new_schema: true + clickhousemetricswrite: + endpoint: tcp://clickhouse:9000/signoz_metrics + resource_to_telemetry_conversion: + enabled: true + disable_v2: true + clickhousemetricswrite/prometheus: + endpoint: tcp://clickhouse:9000/signoz_metrics + disable_v2: true + signozclickhousemetrics: + dsn: tcp://clickhouse:9000/signoz_metrics + clickhouselogsexporter: + dsn: tcp://clickhouse:9000/signoz_logs + timeout: 10s + use_new_schema: true + otlp: + endpoint: http://otel-collector:4317 + tls: + insecure: true + # debug: {} +service: + telemetry: + logs: + encoding: json + metrics: + address: 0.0.0.0:8888 + extensions: + - health_check + - pprof + pipelines: + traces: + receivers: [otlp] + processors: [signozspanmetrics/delta, batch] + exporters: [clickhousetraces] + metrics/docker: + receivers: [docker_stats] + processors: [resourcedetection/docker] + exporters: [otlp] + metrics: + receivers: [otlp] + processors: [batch] + exporters: [clickhousemetricswrite, signozclickhousemetrics] + metrics/prometheus: + receivers: [prometheus] + processors: [batch] + exporters: [clickhousemetricswrite/prometheus, signozclickhousemetrics] + logs: + receivers: [otlp] + processors: [batch] + exporters: [clickhouselogsexporter] diff --git a/iac/ansible/resources/signoz/config/signoz/otel-collector-opamp-config.yaml b/iac/ansible/resources/signoz/config/signoz/otel-collector-opamp-config.yaml new file mode 100644 index 0000000..7267607 --- /dev/null +++ b/iac/ansible/resources/signoz/config/signoz/otel-collector-opamp-config.yaml @@ -0,0 +1 @@ +server_endpoint: ws://signoz:4320/v1/opamp diff --git a/iac/ansible/resources/signoz/config/signoz/prometheus.v1.yml b/iac/ansible/resources/signoz/config/signoz/prometheus.v1.yml new file mode 100644 index 0000000..683e5e1 --- /dev/null +++ b/iac/ansible/resources/signoz/config/signoz/prometheus.v1.yml @@ -0,0 +1,25 @@ +# my global config +global: + scrape_interval: 5s # Set the scrape interval to every 15 seconds. Default is every 1 minute. + evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. + # scrape_timeout is set to the global default (10s). 
+ +# Alertmanager configuration +alerting: + alertmanagers: + - static_configs: + - targets: + - alertmanager:9093 + +# Load rules once and periodically evaluate them according to the global 'evaluation_interval'. +rule_files: [] + # - "first_rules.yml" + # - "second_rules.yml" + # - 'alerts.yml' + +# A scrape configuration containing exactly one endpoint to scrape: +# Here it's Prometheus itself. +scrape_configs: [] + +remote_read: + - url: tcp://clickhouse:9000/signoz_metrics diff --git a/iac/ansible/resources/signoz/signoz.yml b/iac/ansible/resources/signoz/signoz.yml new file mode 100644 index 0000000..8a4e476 --- /dev/null +++ b/iac/ansible/resources/signoz/signoz.yml @@ -0,0 +1,243 @@ +version: '3' +x-common: &common + networks: + - signoz-net + deploy: + restart_policy: + condition: on-failure + logging: + options: + max-size: 50m + max-file: '3' +x-clickhouse-defaults: &clickhouse-defaults + !!merge <<: *common + image: clickhouse/clickhouse-server:24.1.2-alpine + tty: true + user: "1000:1000" + deploy: + placement: + constraints: [node.hostname == manager-node-3] + labels: + signoz.io/scrape: 'true' + signoz.io/port: '9363' + signoz.io/path: '/metrics' + depends_on: + - init-clickhouse + - zookeeper-1 + healthcheck: + test: + - CMD + - wget + - --spider + - -q + - 0.0.0.0:8123/ping + interval: 30s + timeout: 5s + retries: 3 + ulimits: + nproc: 65535 + nofile: + soft: 262144 + hard: 262144 +x-zookeeper-defaults: &zookeeper-defaults + !!merge <<: *common + image: bitnami/zookeeper:3.7.1 + user: root + deploy: + placement: + constraints: [node.hostname == manager-node-1] + labels: + signoz.io/scrape: 'true' + signoz.io/port: '9141' + signoz.io/path: '/metrics' + healthcheck: + test: + - CMD-SHELL + - curl -s -m 2 http://localhost:8080/commands/ruok | grep error | grep null + interval: 30s + timeout: 5s + retries: 3 +x-db-depend: &db-depend + !!merge <<: *common + depends_on: + - clickhouse + - schema-migrator +services: + init-clickhouse: + !!merge <<: *common + image: clickhouse/clickhouse-server:24.1.2-alpine + command: + - bash + - -c + - | + version="v0.0.1" + node_os=$$(uname -s | tr '[:upper:]' '[:lower:]') + node_arch=$$(uname -m | sed s/aarch64/arm64/ | sed s/x86_64/amd64/) + echo "Fetching histogram-binary for $${node_os}/$${node_arch}" + cd /tmp + wget -O histogram-quantile.tar.gz "https://github.com/SigNoz/signoz/releases/download/histogram-quantile%2F$${version}/histogram-quantile_$${node_os}_$${node_arch}.tar.gz" + tar -xvzf histogram-quantile.tar.gz + mv histogram-quantile /var/lib/clickhouse/user_scripts/histogramQuantile + deploy: + restart_policy: + condition: on-failure + volumes: + - /mnt/cephfs/signoz/data/clickhouse/user_scripts:/var/lib/clickhouse/user_scripts/ + zookeeper-1: + !!merge <<: *zookeeper-defaults + # ports: + # - "2181:2181" + # - "2888:2888" + # - "3888:3888" + volumes: + - /mnt/cephfs/signoz/data/zookeeper-1:/bitnami/zookeeper + environment: + - ZOO_SERVER_ID=1 + - ALLOW_ANONYMOUS_LOGIN=yes + - ZOO_AUTOPURGE_INTERVAL=1 + - ZOO_ENABLE_PROMETHEUS_METRICS=yes + - ZOO_PROMETHEUS_METRICS_PORT_NUMBER=9141 + clickhouse: + !!merge <<: *clickhouse-defaults + # TODO: needed for clickhouse TCP connectio + hostname: clickhouse + # ports: + # - "9000:9000" + # - "8123:8123" + # - "9181:9181" + configs: + - source: clickhouse-config-v1 + target: /etc/clickhouse-server/config.xml + - source: clickhouse-users-v1 + target: /etc/clickhouse-server/users.xml + - source: clickhouse-custom-function-v1 + target: /etc/clickhouse-server/custom-function.xml + - source: 
clickhouse-cluster-v1 + target: /etc/clickhouse-server/config.d/cluster.xml + volumes: + - /mnt/cephfs/signoz/data/clickhouse/data/user_scripts:/var/lib/clickhouse/user_scripts/ + - /mnt/cephfs/signoz/data/clickhouse/data:/var/lib/clickhouse/ + # - ../common/clickhouse/storage.xml:/etc/clickhouse-server/config.d/storage.xml + signoz: + !!merge <<: *db-depend + image: signoz/signoz:v0.86.1 + command: + - --config=/root/config/prometheus.yml + # ports: + # - "8080:8080" # signoz port + # - "6060:6060" # pprof port + configs: + - source: signoz-prometheus-config-v1 + target: /root/config/prometheus.yml + volumes: + - /mnt/cephfs/signoz/data/dashboards:/root/config/dashboards + - /mnt/cephfs/signoz/data/sqlite:/var/lib/signoz/ + environment: + - SIGNOZ_ALERTMANAGER_PROVIDER=signoz + - SIGNOZ_TELEMETRYSTORE_CLICKHOUSE_DSN=tcp://clickhouse:9000 + - SIGNOZ_SQLSTORE_SQLITE_PATH=/var/lib/signoz/signoz.db + - DASHBOARDS_PATH=/root/config/dashboards + - STORAGE=clickhouse + - GODEBUG=netdns=go + - TELEMETRY_ENABLED=true + - DEPLOYMENT_TYPE=docker-swarm + healthcheck: + test: + - CMD + - wget + - --spider + - -q + - localhost:8080/api/v1/health + interval: 30s + timeout: 5s + retries: 3 + networks: + - signoz-net + - traefik_public + deploy: + labels: + - 'traefik.enable=true' + # --- Router für Signoz UI --- + - 'traefik.http.routers.signoz.rule=Host(`signoz.genius.ceo`)' + - 'traefik.http.routers.signoz.entrypoints=https' + - 'traefik.http.routers.signoz.tls.certresolver=main' + # --- Service für Signoz UI --- + - 'traefik.http.services.signoz.loadbalancer.server.port=8080' + # --- Netzwerk für Traefik --- + - 'traefik.swarm.network=traefik_public' + otel-collector: + !!merge <<: *db-depend + image: signoz/signoz-otel-collector:v0.111.42 + user: root + command: + - --config=/etc/otel-collector-config.yaml + - --manager-config=/etc/manager-config.yaml + - --copy-path=/var/tmp/collector-config.yaml + - --feature-gates=-pkg.translator.prometheus.NormalizeName + configs: + - source: otel-collector-config-v4 + target: /etc/otel-collector-config.yaml + - source: otel-collector-manager-config-v1 + target: /etc/manager-config.yaml + volumes: + - /var/run/docker.sock:/var/run/docker.sock:ro + environment: + - OTEL_RESOURCE_ATTRIBUTES=host.name={{.Node.Hostname}},os.type={{.Node.Platform.OS}} + - LOW_CARDINAL_EXCEPTION_GROUPING=false + ports: + # - "1777:1777" # pprof extension + - '4317:4317' # OTLP gRPC receiver + - '4318:4318' # OTLP HTTP receiver + deploy: + replicas: 3 + labels: + - 'traefik.enable=true' + # --- Router für Signoz Collector UI --- + - 'traefik.http.routers.signoz-collector.rule=Host(`collector.genius.ceo`)' + - 'traefik.http.routers.signoz-collector.entrypoints=https' + - 'traefik.http.routers.signoz-collector.tls.certresolver=main' + # --- Service für Signoz Collector UI --- + - 'traefik.http.services.signoz-collector.loadbalancer.server.port=4318' + # --- Netzwerk für Traefik --- + - 'traefik.swarm.network=traefik_public' + depends_on: + - clickhouse + - schema-migrator + - signoz + networks: + - signoz-net + - traefik_public + schema-migrator: + !!merge <<: *common + image: signoz/signoz-schema-migrator:v0.111.42 + deploy: + restart_policy: + condition: on-failure + delay: 5s + entrypoint: sh + command: + - -c + - '/signoz-schema-migrator sync --dsn=tcp://clickhouse:9000 --up= && /signoz-schema-migrator async --dsn=tcp://clickhouse:9000 --up=' + depends_on: + - clickhouse +networks: + signoz-net: + name: signoz-net + attachable: true + traefik_public: + external: true +configs: + 
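+  # Swarm configs are immutable once created, so the version suffixes (v1, v4, ...) appear
+  # to be how this stack rolls out changes: bump the suffix here and in the consuming
+  # service's "configs:" section to deploy a new revision.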
otel-collector-config-v4: + file: /mnt/cephfs/signoz/config/otel-collector-config.v4.yaml + otel-collector-manager-config-v1: + file: /mnt/cephfs/signoz/config/signoz/otel-collector-opamp-config.yaml + clickhouse-config-v1: + file: /mnt/cephfs/signoz/config/clickhouse/config.v1.xml + clickhouse-users-v1: + file: /mnt/cephfs/signoz/config/clickhouse/users.v1.xml + clickhouse-custom-function-v1: + file: /mnt/cephfs/signoz/config/clickhouse/custom-function.v1.xml + clickhouse-cluster-v1: + file: /mnt/cephfs/signoz/config/clickhouse/cluster.v1.xml + signoz-prometheus-config-v1: + file: /mnt/cephfs/signoz/config/signoz/prometheus.v1.yml diff --git a/iac/ansible/roles/authentik/tasks/main.yml b/iac/ansible/roles/authentik/tasks/main.yml new file mode 100644 index 0000000..9d8b2a9 --- /dev/null +++ b/iac/ansible/roles/authentik/tasks/main.yml @@ -0,0 +1,33 @@ +--- +# - name: AUTHENTIK | Verzeichnisse erstellen und Berechtigungen setzen +# ansible.builtin.file: +# path: "/mnt/cephfs/authentik/data/{{ item }}" +# state: directory +# owner: 1000 +# group: 1000 +# mode: '0755' +# loop: +# - cache +# - certs +# - db +# - media +# - templates +# run_once: true +# delegate_to: "{{ groups['managers'][0] }}" + +- name: AUTHENTIK | Generate Compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: /mnt/cephfs/authentik/authentik.yml + mode: 0644 + run_once: true + delegate_to: "{{ groups['managers'][0] }}" + +- name: AUTHENTIK | Deploy app stack + community.docker.docker_stack: + state: present + name: authentik + compose: + - /mnt/cephfs/authentik/authentik.yml + delegate_to: "{{ groups['managers'][0] }}" + run_once: true \ No newline at end of file diff --git a/iac/ansible/roles/authentik/templates/docker-compose.yml.j2 b/iac/ansible/roles/authentik/templates/docker-compose.yml.j2 new file mode 100644 index 0000000..d8c996f --- /dev/null +++ b/iac/ansible/roles/authentik/templates/docker-compose.yml.j2 @@ -0,0 +1,100 @@ +--- +networks: + traefik_public: + external: true + internal: + +services: + postgresql: + image: docker.io/library/postgres:16-alpine + restart: unless-stopped + healthcheck: + test: ["CMD-SHELL", "pg_isready -d $POSTGRES_DB -U $POSTGRES_USER"] + start_period: 20s + interval: 30s + retries: 5 + timeout: 5s + volumes: + - /mnt/cephfs/authentik/data/db:/var/lib/postgresql/data + environment: + POSTGRES_PASSWORD: "{{ pg_pass }}" + POSTGRES_USER: "{{ pg_user | default('authentik') }}" + POSTGRES_DB: "{{ pg_db | default('authentik') }}" + networks: + - internal + + redis: + image: docker.io/library/redis:alpine + command: --save 60 1 --loglevel warning + restart: unless-stopped + healthcheck: + test: ["CMD-SHELL", "redis-cli ping | grep PONG"] + start_period: 20s + interval: 30s + retries: 5 + timeout: 3s + volumes: + - /mnt/cephfs/authentik/data/cache:/data + networks: + - internal + + server: + image: "{{ authentik_image | default('ghcr.io/goauthentik/server') }}:{{ authentik_tag | default('2025.6.3') }}" + restart: unless-stopped + command: server + environment: + AUTHENTIK_SECRET_KEY: "{{ authentik_secret_key }}" + AUTHENTIK_REDIS__HOST: redis + AUTHENTIK_POSTGRESQL__HOST: postgresql + AUTHENTIK_POSTGRESQL__USER: "{{ pg_user | default('authentik') }}" + AUTHENTIK_POSTGRESQL__NAME: "{{ pg_db | default('authentik') }}" + AUTHENTIK_POSTGRESQL__PASSWORD: "{{ pg_pass }}" + AUTHENTIK_ERROR_REPORTING__ENABLED: "false" + volumes: + - /mnt/cephfs/authentik/data/media:/media + - /mnt/cephfs/authentik/data/templates:/templates + networks: + - traefik_public + - internal + 
deploy: + labels: + traefik.enable: "true" + traefik.swarm.network: {{ traefik_net }} + traefik.http.routers.authentik.rule: Host(`{{ traefik_route }}`) || HostRegexp(`{subdomain:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?}.genius.ceo`) && PathPrefix(`/outpost.goauthentik.io/`) + traefik.http.routers.authentik.entrypoints: https + traefik.http.routers.authentik.tls: "true" + traefik.http.routers.authentik.tls.certresolver: main + traefik.http.services.authentik.loadbalancer.server.port: 9000 + # - "traefik.enable=true" + # - "traefik.swarm.network={{ traefik_net }}" + # - "traefik.http.routers.authentik.rule=Host(`{{ traefik_route }}`) || HostRegexp(`{subdomain:[A-Za-z0-9](?:[A-Za-z0-9\-]{0,61}[A-Za-z0-9])?}.genius.ceo`) && PathPrefix(`/outpost.goauthentik.io/`)" + # - "traefik.http.routers.authentik.entrypoints=https" + # - "traefik.http.routers.authentik.tls=true" + # - "traefik.http.routers.authentik.tls.certresolver=main" + # - "traefik.http.services.authentik.loadbalancer.server.port=9000" + + worker: + image: "{{ authentik_image | default('ghcr.io/goauthentik/server') }}:{{ authentik_tag | default('2025.6.3') }}" + restart: unless-stopped + command: worker + environment: + AUTHENTIK_SECRET_KEY: "{{ authentik_secret_key }}" + AUTHENTIK_REDIS__HOST: redis + AUTHENTIK_POSTGRESQL__HOST: postgresql + AUTHENTIK_POSTGRESQL__USER: "{{ pg_user | default('authentik') }}" + AUTHENTIK_POSTGRESQL__NAME: "{{ pg_db | default('authentik') }}" + AUTHENTIK_POSTGRESQL__PASSWORD: "{{ pg_pass }}" + # `user: root` and the docker socket volume are optional. + # See more for the docker socket integration here: + # https://goauthentik.io/docs/outposts/integrations/docker + # Removing `user: root` also prevents the worker from fixing the permissions + # on the mounted folders, so when removing this make sure the folders have the correct UID/GID + # (1000:1000 by default) + user: root + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - /mnt/cephfs/authentik/data/media:/media + - /mnt/cephfs/authentik/data/certs:/certs + - /mnt/cephfs/authentik/data/templates:/templates + networks: + - internal diff --git a/iac/ansible/roles/authentik/vars/main.yml b/iac/ansible/roles/authentik/vars/main.yml new file mode 100644 index 0000000..54bbe93 --- /dev/null +++ b/iac/ansible/roles/authentik/vars/main.yml @@ -0,0 +1,11 @@ +--- +authentik_image: "ghcr.io/goauthentik/server" +authentik_tag: "2025.6.3" +authentik_secret_key: "" + +pg_user: "authentik" +pg_pass: "" +pg_db: "authentik" + +traefik_net: "traefik_public" +traefik_route: "auth.genius.ceo" \ No newline at end of file diff --git a/iac/ansible/roles/ceph_setup/tasks/main.yml b/iac/ansible/roles/ceph_setup/tasks/main.yml new file mode 100644 index 0000000..91f7252 --- /dev/null +++ b/iac/ansible/roles/ceph_setup/tasks/main.yml @@ -0,0 +1,93 @@ +--- +- name: CEPH | Private IP des ersten Managers ermitteln + ansible.builtin.set_fact: + ceph_bootstrap_ip: "{{ hostvars[inventory_hostname]['ansible_' + private_interface]['ipv4']['address'] }}" + when: inventory_hostname == groups['managers'][0] + +- name: CEPH | Cluster auf dem ersten Manager initialisieren (Bootstrap) + ansible.builtin.command: + cmd: "cephadm bootstrap --mon-ip {{ ceph_bootstrap_ip }}" + creates: /etc/ceph/ceph.conf + when: inventory_hostname == groups['managers'][0] + +- name: CEPH | Öffentlichen SSH-Schlüssel von cephadm abrufen + ansible.builtin.command: "cephadm shell -- ceph cephadm get-pub-key" + register: cephadm_pub_key + changed_when: false + delegate_to: "{{ groups['managers'][0] 
}}" + run_once: true + +- name: CEPH | Öffentlichen Schlüssel von cephadm auf allen Knoten für root verteilen + ansible.posix.authorized_key: + user: root + key: "{{ hostvars[groups['managers'][0]]['cephadm_pub_key'].stdout }}" + state: present + key_options: 'no-port-forwarding,no-X11-forwarding,no-agent-forwarding,no-pty' + +- name: CEPH | Andere Knoten zum Ceph-Cluster hinzufügen + ansible.builtin.command: + cmd: "ceph orch host add {{ item }} {{ hostvars[item]['ansible_' + private_interface]['ipv4']['address'] }}" + loop: "{{ groups['all'] }}" + delegate_to: "{{ groups['managers'][0] }}" + run_once: true + +- name: CEPH | Prüfen, ob bereits OSDs (Speichergeräte) vorhanden sind + ansible.builtin.command: "ceph osd ls" + register: existing_osds + changed_when: false + failed_when: false + delegate_to: "{{ groups['managers'][0] }}" + run_once: true + +- name: CEPH | Spezifische Festplatte ({{ ceph_osd_device }}) auf jedem Knoten als OSD hinzufügen + ansible.builtin.command: "ceph orch daemon add osd {{ item }}:{{ ceph_osd_device }}" + loop: "{{ groups['all'] }}" + delegate_to: "{{ groups['managers'][0] }}" + run_once: true + when: existing_osds.stdout | length == 0 + +- name: CEPH | Prüfen, ob CephFS bereits existiert + ansible.builtin.command: "ceph fs ls -f json" + register: cephfs_list + changed_when: false + delegate_to: "{{ groups['managers'][0] }}" + run_once: true + +- name: CEPH | CephFS Pools und Dateisystem erstellen, falls nicht vorhanden + block: + - name: Metadaten-Pool für CephFS erstellen + ansible.builtin.command: "ceph osd pool create {{ cephfs_name }}_metadata" + - name: Daten-Pool für CephFS erstellen + ansible.builtin.command: "ceph osd pool create {{ cephfs_name }}_data" + - name: CephFS-Dateisystem erstellen + ansible.builtin.command: "ceph fs new {{ cephfs_name }} {{ cephfs_name }}_metadata {{ cephfs_name }}_data" + when: cephfs_list.stdout | from_json | length == 0 + delegate_to: "{{ groups['managers'][0] }}" + run_once: true + +- name: CEPH | Metadaten-Server (MDS) für CephFS starten + ansible.builtin.command: "ceph orch apply mds {{ cephfs_name }} --placement=2" + delegate_to: "{{ groups['managers'][0] }}" + run_once: true + when: cephfs_list.stdout | from_json | length == 0 + +- name: CEPH | Ceph Admin-Schlüssel für das Mounten abrufen + ansible.builtin.command: "ceph auth get-key client.admin" + register: ceph_admin_key + changed_when: false + delegate_to: "{{ groups['managers'][0] }}" + run_once: true + +- name: CEPH | Mount-Punkt für CephFS erstellen + ansible.builtin.file: + path: /mnt/cephfs + state: directory + mode: '0755' + +- name: CEPH | CephFS auf allen Knoten mounten (und in /etc/fstab eintragen) + ansible.posix.mount: + path: /mnt/cephfs + src: "{{ hostvars[groups['managers'][0]]['ceph_bootstrap_ip'] }}:/" + fstype: ceph + opts: "name=admin,secret={{ ceph_admin_key.stdout }}" + state: mounted \ No newline at end of file diff --git a/iac/ansible/roles/common/tasks/main.yml b/iac/ansible/roles/common/tasks/main.yml new file mode 100644 index 0000000..a1bf357 --- /dev/null +++ b/iac/ansible/roles/common/tasks/main.yml @@ -0,0 +1,108 @@ +--- +- name: COMMON | Systempakete aktualisieren und upgraden + ansible.builtin.apt: + update_cache: true + upgrade: dist + autoremove: true + autoclean: true + +- name: COMMON | Notwendige Pakete installieren + ansible.builtin.apt: + name: + - ufw + - fail2ban + - unattended-upgrades + - apt-listchanges + - docker-ce + - python3-pip + - chrony + - lvm2 + - cephadm + - ceph-common + state: present + +- name: COMMON | 
Chrony Dienst starten und aktivieren + ansible.builtin.service: + name: chronyd + state: started + enabled: true + +- name: COMMON | Docker Dienst starten und aktivieren + ansible.builtin.service: + name: docker + state: started + enabled: true + +- name: COMMON | Einen dedizierten Admin-Benutzer erstellen + ansible.builtin.user: + name: "{{ admin_user }}" + password: "{{ admin_password}}" + shell: /bin/bash + groups: sudo,docker + append: true + state: present + +- name: COMMON | SSH-Schlüssel für den Admin-Benutzer einrichten + ansible.posix.authorized_key: + user: "{{ admin_user }}" + key: "{{ item }}" + state: present + with_items: "{{ authorized_keys }}" + +- name: COMMON | cephadm-Benutzer erstellen + ansible.builtin.user: + name: "cephadm" + password: "{{ cephadm_password }}" + shell: /bin/bash + groups: sudo,docker + append: yes + state: present + +- name: COMMON | .ssh Verzeichnis für cephadm-Benutzer erstellen + ansible.builtin.file: + path: /home/cephadm/.ssh + state: directory + +- name: COMMON | Passwortloses Sudo für cephadm-Benutzer erlauben + ansible.builtin.copy: + dest: "/etc/sudoers.d/91-cephadm-nopasswd" + content: "cephadm ALL=(ALL) NOPASSWD: ALL" + mode: '0440' + validate: 'visudo -cf %s' + +- name: COMMON | ed25519 SSH-Schlüssel für cephadm-Benutzer generieren (nur auf dem ersten Manager) + community.crypto.openssh_keypair: + path: /home/cephadm/.ssh/id_ed25519 + type: ed25519 + owner: cephadm + group: cephadm + mode: '0600' + when: inventory_hostname == groups['managers'][0] + +- name: COMMON | Öffentlichen SSH-Schlüssel von cephadm abrufen + ansible.builtin.slurp: + src: /home/cephadm/.ssh/id_ed25519.pub + register: cephadm_ssh_pub_key + when: inventory_hostname == groups['managers'][0] + +- name: COMMON | Öffentlichen SSH-Schlüssel von cephadm auf allen Knoten verteilen + ansible.posix.authorized_key: + user: cephadm + key: "{{ hostvars[groups['managers'][0]]['cephadm_ssh_pub_key']['content'] | b64decode }}" + state: present + +- name: COMMON | Automatische Sicherheitsupdates konfigurieren + ansible.builtin.copy: + src: assets/50unattended-upgrades + dest: /etc/apt/apt.conf.d/50unattended-upgrades + owner: root + group: root + mode: '0644' + +- name: COMMON | Periodische Auto-Updates aktivieren + ansible.builtin.copy: + src: assets/20auto-upgrades + dest: /etc/apt/apt.conf.d/20auto-upgrades + owner: root + group: root + mode: '0644' \ No newline at end of file diff --git a/iac/ansible/roles/docker_swarm/tasks/main.yml b/iac/ansible/roles/docker_swarm/tasks/main.yml new file mode 100644 index 0000000..f18b250 --- /dev/null +++ b/iac/ansible/roles/docker_swarm/tasks/main.yml @@ -0,0 +1,58 @@ +--- +- name: SWARM | Ensure Docker SDK for Python is installed + ansible.builtin.apt: + name: python3-docker + state: present + +- name: SWARM | Get interface IP address for the manager + ansible.builtin.set_fact: + manager_ip: "{{ hostvars[inventory_hostname]['ansible_' + private_interface]['ipv4']['address'] }}" + when: inventory_hostname == groups['managers'][0] + +- name: SWARM | Initialize the Docker Swarm + community.docker.docker_swarm: + state: present + advertise_addr: "{{ manager_ip }}" + when: inventory_hostname == groups['managers'][0] + register: swarm_init_result + +- name: SWARM | Get the join tokens + community.docker.docker_swarm_info: + register: swarm_info + when: inventory_hostname == groups['managers'][0] + +- name: SWARM | Verify that join tokens were fetched + ansible.builtin.assert: + that: + - swarm_info is defined + - swarm_info.swarm_facts is defined 
+ - swarm_info.swarm_facts.JoinTokens.Manager is defined + - swarm_info.swarm_facts.JoinTokens.Worker is defined + fail_msg: "Konnte die Join-Tokens vom Swarm Manager nicht abrufen. Ist der Swarm korrekt initialisiert?" + success_msg: "Join-Tokens erfolgreich abgerufen." + when: inventory_hostname == groups['managers'][0] + +- name: SWARM | Join manager nodes to the Swarm + community.docker.docker_swarm: + state: join + remote_addrs: [ "{{ hostvars[groups['managers'][0]]['manager_ip'] }}:2377" ] + join_token: "{{ hostvars[groups['managers'][0]]['swarm_info']['swarm_facts']['JoinTokens']['Manager'] }}" + when: inventory_hostname in groups['managers'] + +- name: SWARM | Join worker nodes to the Swarm + community.docker.docker_swarm: + state: join + remote_addrs: [ "{{ hostvars[groups['managers'][0]]['manager_ip'] }}:2377" ] + join_token: "{{ hostvars[groups['managers'][0]]['swarm_info']['swarm_facts']['JoinTokens']['Worker'] }}" + when: inventory_hostname in groups['workers'] + +- name: SWARM | Verify Swarm Cluster State (run on manager) + ansible.builtin.command: docker node ls + register: swarm_nodes + changed_when: false + when: inventory_hostname == groups['managers'][0] + +- name: SWARM | Display cluster state + ansible.builtin.debug: + msg: "{{ swarm_nodes.stdout_lines }}" + when: inventory_hostname == groups['managers'][0] \ No newline at end of file diff --git a/iac/ansible/roles/dockge/defaults/main.yml b/iac/ansible/roles/dockge/defaults/main.yml new file mode 100644 index 0000000..2feb594 --- /dev/null +++ b/iac/ansible/roles/dockge/defaults/main.yml @@ -0,0 +1,3 @@ +--- +dockge_stacks_dir: /mnt/cephfs/dockge/stacks +dockge_data_dir: /mnt/cephfs/dockge/data diff --git a/iac/ansible/roles/dockge/tasks/main.yml b/iac/ansible/roles/dockge/tasks/main.yml new file mode 100644 index 0000000..8ded960 --- /dev/null +++ b/iac/ansible/roles/dockge/tasks/main.yml @@ -0,0 +1,42 @@ +--- +- name: DOCKGE | Copy Stack Files + copy: + directory_mode: true + src: /Users/d3r0/dev/repositories/active/gc/iac/ansible/resources/dockge + dest: /mnt/cephfs + run_once: true + delegate_to: "{{ groups['managers'][0] }}" +# - name: DOCKGE | Sicherstellen, dass das Verzeichnis für die Anwendungs Daten existiert +# ansible.builtin.file: +# path: "{{ dockge_data_dir }}" +# state: directory +# owner: root +# group: root +# mode: '0755' +# become: true + +# - name: DOCKGE | Sicherstellen, dass das Verzeichnis für die Stacks existiert +# ansible.builtin.file: +# path: "{{ dockge_stacks_dir }}" +# state: directory +# owner: root +# group: root +# mode: '0755' +# become: true + +# - name: DOCKGE | Stack aus der Template-Datei bereitstellen +# community.docker.docker_stack: +# state: present +# name: dockge +# compose: +# - "{{ lookup('template', '../../../resources/dockge/dockge.yml') }}" +# delegate_to: "{{ groups['managers'][0] }}" +# run_once: true +- name: DOCKGE | Deploy app stack + community.docker.docker_stack: + state: present + name: dockge + compose: + - /mnt/cephfs/dockge/dockge.yml + delegate_to: "{{ groups['managers'][0] }}" + run_once: true \ No newline at end of file diff --git a/iac/ansible/roles/fail2ban/tasks/main.yml b/iac/ansible/roles/fail2ban/tasks/main.yml new file mode 100644 index 0000000..275505d --- /dev/null +++ b/iac/ansible/roles/fail2ban/tasks/main.yml @@ -0,0 +1,9 @@ +--- +- name: FAIL2BAN | Eine lokale Jail-Konfiguration erstellen + ansible.builtin.template: + src: jail.local.j2 + dest: /etc/fail2ban/jail.local + owner: root + group: root + mode: '0644' + notify: restart fail2ban \ No 
newline at end of file diff --git a/iac/ansible/roles/fail2ban/templates/jail.local.j2 b/iac/ansible/roles/fail2ban/templates/jail.local.j2 new file mode 100644 index 0000000..0039dca --- /dev/null +++ b/iac/ansible/roles/fail2ban/templates/jail.local.j2 @@ -0,0 +1,8 @@ +[DEFAULT] +bantime = 1h +findtime = 10m +maxretry = 5 + +[sshd] +enabled = true +port = {{ ssh_port }} \ No newline at end of file diff --git a/iac/ansible/roles/gitea/defaults/main.yml b/iac/ansible/roles/gitea/defaults/main.yml new file mode 100644 index 0000000..5e13e41 --- /dev/null +++ b/iac/ansible/roles/gitea/defaults/main.yml @@ -0,0 +1,14 @@ +postgres_version: 16-alpine +gitea_version: "1.21" +gitea_domain: "{{ subdomain }}.{{ main_domain }}" +gitea_http_port: 3000 +gitea_ssh_port: 2222 + +data_dir: "{{ ceph_volume }}/gitea" +subdomain: git + +gitea_db_type: "postgres" +gitea_db_host: db:5432 +gitea_db_name: "gitea" +gitea_db_user: "gitea" +gitea_db_password: "" \ No newline at end of file diff --git a/iac/ansible/roles/gitea/tasks/main.yml b/iac/ansible/roles/gitea/tasks/main.yml new file mode 100644 index 0000000..ac7f071 --- /dev/null +++ b/iac/ansible/roles/gitea/tasks/main.yml @@ -0,0 +1,38 @@ +- name: GITEA | Ensure data directories + ansible.builtin.file: + path: '{{ data_dir }}/data' + state: directory + owner: 1000 + group: 1000 + mode: '0750' + recurse: yes + delegate_to: "{{ groups['managers'][0] }}" + +- name: GITEA | Ensure DB data directories + ansible.builtin.file: + path: "{{ data_dir }}/data/db" + state: directory + # Postgres Alpine nutzt UID 70 (postgres). + # Bei Debian-Images wäre es 999. + owner: 70 + group: 70 + mode: '0700' + recurse: yes + delegate_to: "{{ groups['managers'][0] }}" + +- name: GITEA | Generate Compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: '{{ data_dir }}/gitea.yml' + mode: 0644 + run_once: true + delegate_to: "{{ groups['managers'][0] }}" + +- name: GITEA | Deploy stack + community.docker.docker_stack: + state: present + name: gitea + compose: + - '{{ data_dir }}/gitea.yml' + delegate_to: "{{ groups['managers'][0] }}" + run_once: true diff --git a/iac/ansible/roles/gitea/templates/docker-compose.yml.j2 b/iac/ansible/roles/gitea/templates/docker-compose.yml.j2 new file mode 100644 index 0000000..bde8d7c --- /dev/null +++ b/iac/ansible/roles/gitea/templates/docker-compose.yml.j2 @@ -0,0 +1,62 @@ +networks: + {{ traefik_public_net }}: + external: true + internal: + +services: + server: + image: gitea/gitea:{{ gitea_version }} + environment: + - USER_UID=1000 + - USER_GID=1000 + - GITEA__database__DB_TYPE={{ gitea_db_type }} + - GITEA__database__HOST={{ gitea_db_host }} + - GITEA__database__NAME={{ gitea_db_name }} + - GITEA__database__USER={{ gitea_db_user }} + - GITEA__database__PASSWD={{ gitea_db_password }} + - GITEA__server__DOMAIN={{ gitea_domain }} + - GITEA__server__SSH_DOMAIN={{ gitea_domain }} + - GITEA__server__SSH_PORT={{ gitea_ssh_port }} + - GITEA__server__ROOT_URL=https://{{ gitea_domain }}/ + volumes: + - {{ data_dir }}/data:/data + - /etc/timezone:/etc/timezone:ro + - /etc/localtime:/etc/localtime:ro + networks: + - internal + - {{ traefik_public_net }} + ports: + - "{{ gitea_ssh_port }}:22" + deploy: + mode: replicated + replicas: 1 + labels: + - "traefik.enable=true" + - "traefik.docker.network={{ traefik_public_net }}" + - "traefik.http.routers.gitea.rule=Host(`{{ gitea_domain }}`)" + - "traefik.http.routers.gitea.entrypoints=https" + - "traefik.http.routers.gitea.tls.certresolver=main" + - 
"traefik.http.services.gitea.loadbalancer.server.port=3000" + + db: + image: postgres:{{ postgres_version }} + restart: always + environment: + - POSTGRES_USER={{ gitea_db_user }} + - POSTGRES_PASSWORD={{ gitea_db_password }} + - POSTGRES_DB={{ gitea_db_name }} + networks: + - internal + volumes: + - {{ data_dir }}/data/db:/var/lib/postgresql/data + command: + - "postgres" + - "-c" + - "fsync=on" + - "-c" + - "full_page_writes=on" + - "-c" + - "synchronous_commit=on" + deploy: + mode: replicated + replicas: 1 \ No newline at end of file diff --git a/iac/ansible/roles/kestra/tasks/main.yml b/iac/ansible/roles/kestra/tasks/main.yml new file mode 100644 index 0000000..fba5cbe --- /dev/null +++ b/iac/ansible/roles/kestra/tasks/main.yml @@ -0,0 +1,47 @@ +--- +- name: KESTRA | Ensure data directory + ansible.builtin.file: + path: '{{ data_dir }}/data/data' + state: directory + mode: '0755' + run_once: true + delegate_to: "{{ groups['managers'][0] }}" + +- name: KESTRA | Ensure db directory + ansible.builtin.file: + path: '{{ data_dir }}/data/db' + state: directory + mode: '0755' + run_once: true + delegate_to: "{{ groups['managers'][0] }}" + +- name: KESTRA | Konfigurationsdatei für tmpfiles.d erstellen + ansible.builtin.copy: + content: "d /tmp/kestra-wd 0755 root root -" + dest: /etc/tmpfiles.d/kestra-wd.conf + owner: root + group: root + mode: '0644' + +- name: KESTRA | Create Kestra working directory + ansible.builtin.file: + path: /tmp/kestra-wd + state: directory + mode: '0755' + +- name: KESTRA | Generate Compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: '{{ data_dir }}/kestra.yml' + mode: 0644 + run_once: true + delegate_to: "{{ groups['managers'][0] }}" + +- name: KESTRA | Deploy stack + community.docker.docker_stack: + state: present + name: kestra + compose: + - /mnt/cephfs/kestra/kestra.yml + delegate_to: "{{ groups['managers'][0] }}" + run_once: true \ No newline at end of file diff --git a/iac/ansible/roles/kestra/templates/docker-compose.yml.j2 b/iac/ansible/roles/kestra/templates/docker-compose.yml.j2 new file mode 100644 index 0000000..de37c27 --- /dev/null +++ b/iac/ansible/roles/kestra/templates/docker-compose.yml.j2 @@ -0,0 +1,92 @@ +networks: + internal: + {{ traefik_public_net }}: + external: true + +services: + postgres: + image: postgres:17 + volumes: + - {{ data_dir }}/data/db:/var/lib/postgresql/data + environment: + POSTGRES_DB: {{ kestra.db.name }} + POSTGRES_USER: {{ kestra.db.user }} + POSTGRES_PASSWORD: "{{ kestra.db.pass }}" + healthcheck: + test: ["CMD-SHELL", "pg_isready -d '$${POSTGRES_DB}' -U $${POSTGRES_USER}"] + interval: 30s + timeout: 10s + retries: 10 + networks: + - internal + deploy: + mode: replicated + replicas: 1 + + kestra: + image: kestra/kestra:v0.24.2 + entrypoint: /bin/bash + # Note that this is meant for development only. Refer to the documentation for production deployments of Kestra which runs without a root user. 
+ user: "root" + command: + - -c + - /app/kestra server standalone --worker-thread=128 + volumes: + - {{ data_dir }}/data/data:/app/storage + - /var/run/docker.sock:/var/run/docker.sock + - /tmp/kestra-wd:/tmp/kestra-wd + environment: + KESTRA_CONFIGURATION: | + datasources: + postgres: + url: jdbc:postgresql://postgres:5432/kestra + driverClassName: org.postgresql.Driver + username: {{ kestra.db.user }} + password: {{ kestra.db.pass }} + kestra: + tutorialFlows: + enabled: false + traces: + root: DEFAULT + micronaut: + metrics: + export: + otlp: + enabled: true + url: http://signoz_otel-collector:4318/v1/metrics + otel: + traces: + exporter: otlp + exporter: + otlp: + endpoint: http://signoz_otel-collector:4318 + server: + basic-auth: + username: {{ kestra.basic_auth.user }} + password: {{ kestra.basic_auth.pass }} + repository: + type: postgres + storage: + type: local + local: + base-path: "/app/storage" + queue: + type: postgres + tasks: + tmp-dir: + path: /tmp/kestra-wd/tmp + url: http://localhost:8080/ + networks: + - {{ traefik_public_net }} + - internal + deploy: + mode: replicated + replicas: 1 + labels: + - "traefik.enable=true" + - "traefik.swarm.network={{ traefik_public_net }}" + - "traefik.http.routers.kestra.rule=Host(`{{ subdomain }}.{{ main_domain }}`)" + - "traefik.http.routers.kestra.entrypoints=https" + - "traefik.http.routers.kestra.tls=true" + - "traefik.http.routers.kestra.tls.certresolver=main" + - "traefik.http.services.kestra.loadbalancer.server.port=8080" diff --git a/iac/ansible/roles/kestra/vars/main.yml b/iac/ansible/roles/kestra/vars/main.yml new file mode 100644 index 0000000..a302259 --- /dev/null +++ b/iac/ansible/roles/kestra/vars/main.yml @@ -0,0 +1,11 @@ +subdomain: kestra +data_dir: "{{ ceph_volume }}/kestra" + +kestra: + basic_auth: + user: "ma@coachhamburg.com" + pass: "igyozi9B87yTeiQ6z2sbe8Y4aQLJV58jdaCNu" + db: + name: kestra + user: kestra + pass: "" \ No newline at end of file diff --git a/iac/ansible/roles/monitoring/tasks/main.yml b/iac/ansible/roles/monitoring/tasks/main.yml new file mode 100644 index 0000000..45befbc --- /dev/null +++ b/iac/ansible/roles/monitoring/tasks/main.yml @@ -0,0 +1,13 @@ +--- +- name: Copy Stack Files + copy: + directory_mode: true + src: /Users/d3r0/dev/repositories/active/gc/iac/ansible/resources/monitoring + dest: /srv +- block: + - name: Deploy Monitoring stack + community.docker.docker_stack: + state: present + name: monitoring + compose: + - /srv/monitoring/observability.yml diff --git a/iac/ansible/roles/portainer/tasks/main.yml b/iac/ansible/roles/portainer/tasks/main.yml new file mode 100644 index 0000000..ece4462 --- /dev/null +++ b/iac/ansible/roles/portainer/tasks/main.yml @@ -0,0 +1,25 @@ +--- +- name: PORTAINER | Ensure data directories + ansible.builtin.file: + path: '{{ data_dir }}/data' + state: directory + mode: '0755' + run_once: true + delegate_to: "{{ groups['managers'][0] }}" + +- name: PORTAINER | Generate Compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: '{{ data_dir }}/portainer.yml' + mode: 0644 + run_once: true + delegate_to: "{{ groups['managers'][0] }}" + +- name: PORTAINER | Deploy stack + community.docker.docker_stack: + state: present + name: portainer + compose: + - '{{ data_dir }}/portainer.yml' + delegate_to: "{{ groups['managers'][0] }}" + run_once: true diff --git a/iac/ansible/roles/portainer/templates/docker-compose.yml.j2 b/iac/ansible/roles/portainer/templates/docker-compose.yml.j2 new file mode 100644 index 0000000..5763891 --- /dev/null +++ 
b/iac/ansible/roles/portainer/templates/docker-compose.yml.j2 @@ -0,0 +1,37 @@ +version: '3.2' + +services: + agent: + image: portainer/agent:{{ portainer_version }} + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - /var/lib/docker/volumes:/var/lib/docker/volumes + networks: + - {{ traefik_public_net }} + deploy: + mode: global + placement: + constraints: [node.platform.os == linux] + + portainer: + image: portainer/portainer-ce:{{ portainer_version }} + command: -H tcp://portainer_agent:9001 --tlsskipverify + volumes: + - {{ data_dir }}/data:/data + networks: + - {{ traefik_public_net }} + deploy: + mode: replicated + replicas: 1 + labels: + - "traefik.enable=true" + - "traefik.swarm.network={{ traefik_public_net }}" + - "traefik.http.routers.portainer.rule=Host(`{{ subdomain }}.{{ main_domain }}`)" + - "traefik.http.routers.portainer.entrypoints=https" + - "traefik.http.routers.portainer.tls=true" + - "traefik.http.routers.portainer.tls.certresolver=main" + - "traefik.http.services.portainer.loadbalancer.server.port=9000" + +networks: + {{ traefik_public_net }}: + external: true \ No newline at end of file diff --git a/iac/ansible/roles/portainer/vars/main.yml b/iac/ansible/roles/portainer/vars/main.yml new file mode 100644 index 0000000..bcbba19 --- /dev/null +++ b/iac/ansible/roles/portainer/vars/main.yml @@ -0,0 +1,4 @@ +subdomain: port +data_dir: "{{ ceph_volume }}/portainer" + +portainer_version: 2.33.5 \ No newline at end of file diff --git a/iac/ansible/roles/signoz-infra/tasks/main.yml b/iac/ansible/roles/signoz-infra/tasks/main.yml new file mode 100644 index 0000000..094f0fb --- /dev/null +++ b/iac/ansible/roles/signoz-infra/tasks/main.yml @@ -0,0 +1,18 @@ +--- +- name: Copy Stack Files + copy: + directory_mode: true + src: /Users/d3r0/dev/repositories/active/gc/iac/ansible/resources/signoz-infra + dest: /mnt/cephfs + delegate_to: "{{ groups['managers'][0] }}" + run_once: true + +- name: Deploy Signoz Infra stack + community.docker.docker_stack: + state: present + name: signoz-infra + prune: true + compose: + - /mnt/cephfs/signoz-infra/signoz-infra.yml + delegate_to: "{{ groups['managers'][0] }}" + run_once: true diff --git a/iac/ansible/roles/signoz/tasks/main.yml b/iac/ansible/roles/signoz/tasks/main.yml new file mode 100644 index 0000000..34e3ca7 --- /dev/null +++ b/iac/ansible/roles/signoz/tasks/main.yml @@ -0,0 +1,18 @@ +--- +- name: Copy Stack Files + copy: + directory_mode: true + src: /Users/d3r0/dev/repositories/active/gc/iac/ansible/resources/signoz + dest: /mnt/cephfs + delegate_to: "{{ groups['managers'][0] }}" + run_once: true + +- name: Deploy Signoz stack + community.docker.docker_stack: + state: present + name: signoz + prune: true + compose: + - /mnt/cephfs/signoz/signoz.yml + delegate_to: "{{ groups['managers'][0] }}" + run_once: true diff --git a/iac/ansible/roles/ssh_hardening/tasks/main.yml b/iac/ansible/roles/ssh_hardening/tasks/main.yml new file mode 100644 index 0000000..8c456d8 --- /dev/null +++ b/iac/ansible/roles/ssh_hardening/tasks/main.yml @@ -0,0 +1,30 @@ +--- +- name: SSH | Ensure privilege separation directory exists + ansible.builtin.file: + path: /run/sshd + state: directory + mode: '0755' + +- name: SSH | Root-Login nur mit Schlüssel erlauben + ansible.builtin.lineinfile: + path: /etc/ssh/sshd_config + regexp: '^#?PermitRootLogin' + line: 'PermitRootLogin prohibit-password' + validate: 'sshd -t -f %s' + notify: restart sshd + +- name: SSH | Passwort-Authentifizierung deaktivieren + ansible.builtin.lineinfile: + path: 
/etc/ssh/sshd_config + regexp: '^#?PasswordAuthentication' + line: 'PasswordAuthentication no' + validate: 'sshd -t -f %s' + notify: restart sshd + +- name: SSH | Leere Passwörter verbieten + ansible.builtin.lineinfile: + path: /etc/ssh/sshd_config + regexp: '^#?PermitEmptyPasswords' + line: 'PermitEmptyPasswords no' + validate: 'sshd -t -f %s' + notify: restart sshd \ No newline at end of file diff --git a/iac/ansible/roles/traefik/files/traefik/config/dynamic/http.middlewares.authentik.toml b/iac/ansible/roles/traefik/files/traefik/config/dynamic/http.middlewares.authentik.toml new file mode 100644 index 0000000..8db552f --- /dev/null +++ b/iac/ansible/roles/traefik/files/traefik/config/dynamic/http.middlewares.authentik.toml @@ -0,0 +1,18 @@ +[http] + [http.middlewares] + [http.middlewares.authentik.forwardAuth] + address = "http://authentik_server:9000/outpost.goauthentik.io/auth/traefik" + trustForwardHeader = true + authResponseHeaders = [ + "X-authentik-username", + "X-authentik-groups", + "X-authentik-email", + "X-authentik-name", + "X-authentik-uid", + "X-authentik-jwt", + "X-authentik-meta-jwks", + "X-authentik-meta-outpost", + "X-authentik-meta-provider", + "X-authentik-meta-app", + "X-authentik-meta-version" + ] \ No newline at end of file diff --git a/iac/ansible/roles/traefik/files/traefik/config/traefik.toml b/iac/ansible/roles/traefik/files/traefik/config/traefik.toml new file mode 100644 index 0000000..850bcbd --- /dev/null +++ b/iac/ansible/roles/traefik/files/traefik/config/traefik.toml @@ -0,0 +1,80 @@ +[global] + checkNewVersion = true + sendAnonymousUsage = false + +[experimental] + otlpLogs = true + +[core] + defaultRuleSyntax = "v2" + +[accessLog] + filePath = "/logs/access.log" + format = "json" + +# Enable the Dashboard +[api] + dashboard = true + +# Write out Traefik logs +[log] + level = "INFO" + format = "json" + filePath = "/logs/traefik.log" + # [log.otlp.http] + # endpoint = "http://signoz_otel-collector:4318/v1/logs" + +[entryPoints] + [entryPoints.http] + address = ":80" + [entryPoints.http.http.redirections.entryPoint] + to = "https" + scheme = "https" + + [entryPoints.https] + address = ":443" + # [entryPoints.https.http.tls] + # certResolver = "main" + +# OTel +# [tracing] +# serviceName = "traefik" +# [tracing.otlp.http] +# endpoint = "http://signoz_otel-collector:4318/v1/traces" +# [tracing.otlp.http.tls] +# insecureSkipVerify = true + +# # Metrics +# [metrics] +# addInternals = false +# [metrics.otlp] +# serviceName = "traefik" +# addEntryPointsLabels = true +# addRoutersLabels = true +# addServicesLabels = true +# [metrics.otlp.http] +# endpoint = "http://signoz_otel-collector:4318/v1/metrics" + # [metrics.otlp.grpc] + # endpoint = "monitoring_alloy:4317" + # insecure = true + +# Let's Encrypt +[certificatesResolvers.main.acme] + email = "ma@coachhamburg.com" + storage = "acme.json" + # uncomment to use staging CA for testing + # caServer = "https://acme-staging-v02.api.letsencrypt.org/directory" + # [certificatesResolvers.main.acme.tlsChallenge] + [certificatesResolvers.main.acme.dnsChallenge] + provider = "digitalocean" + # Uncomment to use HTTP validation, like a caveman! 
+ # [certificatesResolvers.main.acme.httpChallenge] + # entryPoint = "http" + +[providers] + [providers.swarm] + endpoint = "unix:///var/run/docker.sock" + exposedByDefault = false + [providers.file] + directory = "/etc/traefik/dynamic" + watch = true diff --git a/iac/ansible/roles/traefik/files/traefik/data/acme.json b/iac/ansible/roles/traefik/files/traefik/data/acme.json new file mode 100644 index 0000000..e69de29 diff --git a/iac/ansible/roles/traefik/tasks/main.yml b/iac/ansible/roles/traefik/tasks/main.yml new file mode 100644 index 0000000..6ab1a2c --- /dev/null +++ b/iac/ansible/roles/traefik/tasks/main.yml @@ -0,0 +1,44 @@ +--- +- name: TRAEFIK | Copy Stack Files + copy: + directory_mode: true + src: traefik + dest: "{{ ceph_volume }}" + run_once: true + delegate_to: "{{ groups['managers'][0] }}" + +- name: TRAEFIK | Generate Compose file + ansible.builtin.template: + src: docker-compose.yml.j2 + dest: "{{ data_dir }}/traefik.yml" + mode: 0644 + run_once: true + delegate_to: "{{ groups['managers'][0] }}" + +- name: TRAEFIK | Ensure permissions on acme.json + ansible.builtin.file: + path: "{{ data_dir }}/data/acme.json" + mode: '0600' + run_once: true + delegate_to: "{{ groups['managers'][0] }}" + +- name: TRAEFIK | traefik_public Netzwerk erstellen + community.docker.docker_network: + name: traefik_public + driver: overlay + state: present + attachable: yes + ipam_config: + - subnet: '172.16.200.0/24' + gateway: '172.16.200.1' + run_once: true + delegate_to: "{{ groups['managers'][0] }}" + +- name: TRAEFIK | Deploy app stack + community.docker.docker_stack: + state: present + name: traefik + compose: + - "{{ data_dir }}/traefik.yml" + delegate_to: "{{ groups['managers'][0] }}" + run_once: true \ No newline at end of file diff --git a/iac/ansible/roles/traefik/templates/docker-compose.yml.j2 b/iac/ansible/roles/traefik/templates/docker-compose.yml.j2 new file mode 100644 index 0000000..e4e7c2e --- /dev/null +++ b/iac/ansible/roles/traefik/templates/docker-compose.yml.j2 @@ -0,0 +1,55 @@ +services: + app: + image: traefik:{{ traefik_version }} + ports: + - target: 80 + published: 80 + protocol: tcp + mode: host + - target: 443 + published: 443 + protocol: tcp + mode: host + - target: 8080 + published: 8080 + protocol: tcp + environment: +# - HETZNER_API_TOKEN={{ hetzner_api_key }} + - DO_AUTH_TOKEN={{ do_api_key }} + volumes: + - /var/run/docker.sock:/var/run/docker.sock:ro + - {{ data_dir }}/config:/etc/traefik + - {{ data_dir }}/data/logs:/logs + - {{ data_dir }}/data/acme.json:/acme.json + # healthcheck: + # test: ["CMD", "traefik", "healthcheck", "--ping"] + # timeout: 1s + # interval: 10s + # retries: 3 + # start_period: 10s + networks: + - {{ traefik_public_net }} + # Global mode makes an instance of traefik listen on _every_ node, so that regardless of which + # node the request arrives on, it'll be forwarded to the correct backend service. 
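+      # The node.role == manager constraint below narrows that to every manager
+      # node, and the "dummy" loadbalancer port label is the usual workaround so
+      # Traefik accepts a router that only targets the internal api@internal service.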
+ deploy: + mode: global + labels: + - "traefik.enable=true" + - "traefik.swarm.network={{ traefik_public_net }}" + - "traefik.http.routers.api.rule=Host(`{{ subdomain }}.{{ main_domain }}`) && (PathPrefix(`/api`) || PathPrefix(`/dashboard`))" + - "traefik.http.routers.api.entrypoints=https" +{% if use_authentik %} + - "traefik.http.routers.api.middlewares=authentik@file" +{% endif %} + - "traefik.http.routers.api.tls.domains[0].main={{ main_domain }}" + - "traefik.http.routers.api.tls.domains[0].sans=*.{{ main_domain }}" + - "traefik.http.routers.api.tls=true" + - "traefik.http.routers.api.tls.certresolver=main" + - "traefik.http.routers.api.service=api@internal" + - "traefik.http.services.dummy.loadbalancer.server.port=9999" + placement: + constraints: [node.role == manager] + +networks: + {{ traefik_public_net }}: + external: true diff --git a/iac/ansible/roles/traefik/vars/main.yml b/iac/ansible/roles/traefik/vars/main.yml new file mode 100644 index 0000000..6e679d7 --- /dev/null +++ b/iac/ansible/roles/traefik/vars/main.yml @@ -0,0 +1,5 @@ +subdomain: router +use_authentik: true +data_dir: "{{ ceph_volume }}/traefik" + +traefik_version: v3.6.2 \ No newline at end of file diff --git a/iac/ansible/roles/ufw_firewall/tasks/main.yml b/iac/ansible/roles/ufw_firewall/tasks/main.yml new file mode 100644 index 0000000..e992ebb --- /dev/null +++ b/iac/ansible/roles/ufw_firewall/tasks/main.yml @@ -0,0 +1,84 @@ +--- +- name: FIREWALL | UFW auf Standardeinstellungen zurücksetzen + community.general.ufw: + state: reset + +- name: FIREWALL | Standardmäßig allen ausgehenden Traffic erlauben + community.general.ufw: + direction: outgoing + policy: allow + +- name: FIREWALL | Standardmäßig allen eingehenden Traffic blockieren + community.general.ufw: + direction: incoming + policy: deny + +- name: FIREWALL | Eingehenden SSH-Traffic auf öffentlichem Interface erlauben + community.general.ufw: + rule: allow + port: "{{ ssh_port }}" + proto: tcp + interface: "{{ public_interface }}" + direction: in + +- name: FIREWALL | Eingehenden SSH-Traffic auf privatem Interface erlauben + community.general.ufw: + rule: allow + port: "{{ ssh_port }}" + proto: tcp + interface: "{{ private_interface }}" + direction: in + +- name: FIREWALL | Eingehenden HTTP/HTTPS-Traffic auf öffentlichem Interface erlauben + community.general.ufw: + rule: allow + port: "{{ item.port }}" + proto: "{{ item.proto }}" + interface: "{{ public_interface }}" + direction: in + with_items: + - { port: '80', proto: 'tcp' } + - { port: '443', proto: 'tcp' } + +- name: FIREWALL | Ceph Monitor Ports auf privatem Interface erlauben + community.general.ufw: + rule: allow + port: "{{ item }}" + proto: tcp + interface: "{{ private_interface }}" + direction: in + with_items: + - '3300' + - '6789' + +- name: FIREWALL | Ceph OSD/MGR Port-Range auf öffentlichem Interface erlauben + community.general.ufw: + rule: allow + port: "6800:7568" + proto: tcp + interface: "{{ private_interface }}" + direction: in + +- name: FIREWALL | Docker Swarm Management Ports auf privatem Interface erlauben + community.general.ufw: + rule: allow + port: "2377" + proto: tcp + interface: "{{ private_interface }}" + direction: in + +- name: FIREWALL | Docker Swarm Discovery/Network Ports auf privatem Interface erlauben + community.general.ufw: + rule: allow + port: "{{ item.port }}" + proto: "{{ item.proto }}" + interface: "{{ private_interface }}" + direction: in + with_items: + - { port: '7946', proto: 'tcp' } + - { port: '7946', proto: 'udp' } + - { port: '4789', proto: 'udp' 
} + +- name: FIREWALL | UFW aktivieren + community.general.ufw: + state: enabled \ No newline at end of file diff --git a/iac/cluster/Pulumi.infra-base.yaml b/iac/cluster/Pulumi.infra-base.yaml new file mode 100644 index 0000000..171b95b --- /dev/null +++ b/iac/cluster/Pulumi.infra-base.yaml @@ -0,0 +1,3 @@ +config: + hcloud:token: + secure: AAABAHkvxBXaEbrikY6bNyuwXehFp71LvsHTT2LOYHLiAaRCil5cSODn1EktYTYL+f4ryGJtN1j/wiyrAkbZBnyVC1QnSb84tTLYeKYXBtHo2fY87vReuyOwFZbFGylC diff --git a/iac/cluster/Pulumi.yaml b/iac/cluster/Pulumi.yaml new file mode 100644 index 0000000..7393375 --- /dev/null +++ b/iac/cluster/Pulumi.yaml @@ -0,0 +1,9 @@ +name: gc-infra +description: A minimal Go Pulumi program +runtime: go +config: + pulumi:tags: + value: + pulumi:template: go + # hcloud:token: + # value: xqb89P4vF2YlBjU75AAtyoQzNvTHaXyhB0J2UYR8dAmEQDKz5GWeKO7KgEyPzUu5 diff --git a/iac/cluster/main.go b/iac/cluster/main.go index 65f2f47..9d9bcb5 100644 --- a/iac/cluster/main.go +++ b/iac/cluster/main.go @@ -16,7 +16,7 @@ import ( type Infrastructure struct { placementGroup *hcloud.PlacementGroup networkID *pulumi.IDOutput - masterNodes []*hcloud.Server + managerNodes []*hcloud.Server workerNodes []*hcloud.Server } @@ -55,30 +55,31 @@ func main() { panic(err.Error()) } - infra.masterNodes, err = utils.CreateServer(ctx, utils.CreateServerArgs{ + infra.managerNodes, err = utils.CreateServer(ctx, utils.CreateServerArgs{ PlacementGroupId: infra.placementGroup.ID(), NetworkId: infra.networkID, NetworkFirstIP: string(utils.IncrementIP(net.ParseIP("10.0.1.0"))), - Basename: "master-node", - Count: 1, - SshKey: hkey, - }) - if err != nil { - panic(err.Error()) - } - infra.workerNodes, err = utils.CreateServer(ctx, utils.CreateServerArgs{ - PlacementGroupId: infra.placementGroup.ID(), - NetworkId: infra.networkID, - NetworkFirstIP: string(utils.IncrementIP(net.ParseIP("10.0.1.20"))), - Basename: "worker-node", - Count: 2, + Basename: "manager-node", + Count: 3, SshKey: hkey, + ServerType: "ccx23", }) if err != nil { panic(err.Error()) } + // infra.workerNodes, err = utils.CreateServer(ctx, utils.CreateServerArgs{ + // PlacementGroupId: infra.placementGroup.ID(), + // NetworkId: infra.networkID, + // NetworkFirstIP: string(utils.IncrementIP(net.ParseIP("10.0.1.20"))), + // Basename: "worker-node", + // Count: 2, + // SshKey: hkey, + // }) + // if err != nil { + // panic(err.Error()) + // } - for idx, s := range slices.Concat(infra.masterNodes, infra.workerNodes) { + for idx, s := range slices.Concat(infra.managerNodes, infra.workerNodes) { err := utils.InstallAnsibleDependencies(ctx, remote.ConnectionArgs{ Host: s.Ipv4Address, User: pulumi.String("root"), @@ -89,22 +90,28 @@ func main() { } } - var advAddr = infra.masterNodes[0].Networks.ApplyT(func(net []hcloud.ServerNetworkType) string { - return *net[0].Ip - }).(pulumi.StringOutput) + // var advAddr = infra.managerNodes[0].Networks.ApplyT(func(net []hcloud.ServerNetworkType) string { + // return *net[0].Ip + // }).(pulumi.StringOutput) - tokens, err := utils.InitDockerSwarm(ctx, remote.ConnectionArgs{ - Host: infra.masterNodes[0].Ipv4Address, - User: pulumi.String("root"), - PrivateKey: pk.PrivateKeyOpenssh}, advAddr) - if err != nil { - panic(err.Error()) - } + // tokens, err := utils.InitDockerSwarm(ctx, remote.ConnectionArgs{ + // Host: infra.managerNodes[0].Ipv4Address, + // User: pulumi.String("root"), + // PrivateKey: pk.PrivateKeyOpenssh}, advAddr) + // if err != nil { + // panic(err.Error()) + // } - ctx.Export("SwarmTokens", tokens) + // 
ctx.Export("SwarmTokens", tokens) + + // inventory, err := utils.CreateAnsibleInventory(infra.managerNodes, infra.workerNodes) + // if err != nil { + // panic(err.Error()) + // } + // ctx.Export("inventory", inventory) sm := map[string]pulumi.Input{} - for idx, s := range slices.Concat(infra.masterNodes, infra.workerNodes) { + for idx, s := range slices.Concat(infra.managerNodes, infra.workerNodes) { sm[fmt.Sprintf("node-%d-ip", idx)] = s.Ipv4Address } ctx.Export("server-ips", pulumi.Map(sm)) diff --git a/iac/cluster/utils/ansible-configuration.go b/iac/cluster/utils/ansible-configuration.go index d54867f..ceb1dd8 100644 --- a/iac/cluster/utils/ansible-configuration.go +++ b/iac/cluster/utils/ansible-configuration.go @@ -1,11 +1,14 @@ package utils import ( + "bytes" "fmt" "regexp" "strings" + "text/template" "github.com/pulumi/pulumi-command/sdk/go/command/remote" + "github.com/pulumi/pulumi-hcloud/sdk/go/hcloud" "github.com/pulumi/pulumi/sdk/v3/go/pulumi" ) @@ -14,6 +17,11 @@ type SwarmJoinTokens struct { WorkerToken string } +type ServerInfo struct { + Name pulumi.StringOutput + IP pulumi.StringOutput +} + func InstallAnsibleDependencies(ctx *pulumi.Context, connArgs remote.ConnectionArgs, uniqueness string) error { _, err := remote.NewCommand(ctx, strings.Join([]string{uniqueness, "Install Ansible Dependencies"}, ": "), &remote.CommandArgs{ @@ -26,7 +34,7 @@ func InstallAnsibleDependencies(ctx *pulumi.Context, connArgs remote.ConnectionA return nil } -func InitDockerSwarm(ctx *pulumi.Context, connArgs remote.ConnectionArgs, advertiseAddr pulumi.StringOutput) (pulumi.StringOutput, error) { +func InitDockerSwarm(ctx *pulumi.Context, connArgs remote.ConnectionArgs, advertiseAddr pulumi.StringOutput) (pulumi.Output, error) { var tokens SwarmJoinTokens fullCommand := advertiseAddr.ApplyT(func(addr string) *string { @@ -44,17 +52,103 @@ func InitDockerSwarm(ctx *pulumi.Context, connArgs remote.ConnectionArgs, advert return pulumi.StringOutput{}, err } - return out.Stdout.ApplyT(func(output string) string { + return out.Stdout.ApplyT(func(output string) SwarmJoinTokens { searchWorker := "Worker Token: " - pattern := regexp.MustCompile(searchWorker + `(\S+)`) + patternWorker := regexp.MustCompile(searchWorker + `(\S+)`) + searchManager := "Manager Token: " + patternManager := regexp.MustCompile(searchManager + `(\S+)`) - matches := pattern.FindStringSubmatch(output) + matches := patternWorker.FindStringSubmatch(output) if len(matches) > 1 { extracted := matches[1] tokens.WorkerToken = extracted - return extracted } - fmt.Println(tokens.WorkerToken) - return "" - }).(pulumi.StringOutput), nil + matches = patternManager.FindStringSubmatch(output) + if len(matches) > 1 { + extracted := matches[1] + tokens.ManagerToken = extracted + } + return tokens + }), nil +} + +func CreateAnsibleInventory(managerNodes, workerNodes []*hcloud.Server) (pulumi.Output, error) { + serverInfos := toServerInfo(managerNodes) + return pulumi.All(pulumi.ToOutput(serverInfos)).ApplyT(func(results []interface{}) (string, error) { + var serverInfos = results[0].([]ServerInfo) + // var workerSlice = results[1].([]*hcloud.Server) + + serverData := make(map[string][]ServerInfo) + + for _, s := range serverInfos { + serverData["Manager"] = append(serverData["Manager"], ServerInfo{ + Name: s.Name, + IP: s.IP, + }) + } + // for _, result := range workerSlice { + // server := result.(map[string]interface{}) + // serverData["Worker"] = append(serverData["Worker"], ServerInfo{ + // Name: server["name"].(string), + // IP: 
server["ipv4_address"].(string), + // }) + // } + fmt.Println(serverData["Manager"]) + fmt.Println(results[0]) + return generateInventoryFile(serverData) + }).(pulumi.Output), nil +} + +func toServerInfo(server []*hcloud.Server) pulumi.ArrayOutput { + serverInfo := []ServerInfo{} + for _, s := range server { + serverInfo = append(serverInfo, ServerInfo{ + Name: s.Name, + IP: s.Ipv4Address, + }) + } + return pulumi.All(serverInfo).ApplyT(func(args []interface{}) []interface{} { + var serverInfo []interface{} + + for _, s := range args { + val := s.(map[string]interface{}) + serverInfo = append(serverInfo, map[string]interface{}{ + "Name": val["Name"].(string), + "IP": val["IP"].(string), + }) + } + return serverInfo + }).(pulumi.ArrayOutput) +} + +func generateInventoryFile(inventory map[string][]ServerInfo) (string, error) { + const inventoryTmpl = ` +[all] +{{ range .Manager }} +{{ .Name }} ansible_host={{ .IP }} ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=../infra-base/private_key +{{ end }} +{{ range .Worker }} +{{ .Name }} ansible_host={{ .IP }} ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=../infra-base/private_key +{{ end }} + +[manager] +{{ range .Manager }} +{{ .Name }} ansible_host={{ .IP }} ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=../infra-base/private_key +{{ end }} + +[worker] +{{ range .Worker }} +{{ .Name }} ansible_host={{ .IP }} ansible_connection=ssh ansible_user=root ansible_ssh_private_key_file=../infra-base/private_key +{{ end }} + ` + tmpl, err := template.New("inventory").Parse(inventoryTmpl) + if err != nil { + return "", err + } + var buf bytes.Buffer + err = tmpl.Execute(&buf, inventory) + if err != nil { + return "", err + } + return buf.String(), nil } diff --git a/iac/cluster/utils/resource-creation.go b/iac/cluster/utils/resource-creation.go index d62d909..75dc782 100644 --- a/iac/cluster/utils/resource-creation.go +++ b/iac/cluster/utils/resource-creation.go @@ -54,6 +54,7 @@ type CreateServerArgs struct { Basename string Count int SshKey *hcloud.SshKey + ServerType string } func CreateServer(ctx *pulumi.Context, cfg CreateServerArgs) ([]*hcloud.Server, error) { @@ -64,9 +65,8 @@ func CreateServer(ctx *pulumi.Context, cfg CreateServerArgs) ([]*hcloud.Server, s, err := hcloud.NewServer(ctx, sn, &hcloud.ServerArgs{ Name: pulumi.String(sn), Image: pulumi.String("docker-ce"), - ServerType: pulumi.String("cpx21"), - Location: pulumi.StringPtr("fsn1"), - // Datacenter: pulumi.StringPtr("fsn1"), + ServerType: pulumi.String(cfg.ServerType), + Location: pulumi.StringPtr("hel1"), Networks: hcloud.ServerNetworkTypeArray{ &hcloud.ServerNetworkTypeArgs{ NetworkId: IDtoIntOutput(cfg.NetworkId), @@ -85,6 +85,24 @@ func CreateServer(ctx *pulumi.Context, cfg CreateServerArgs) ([]*hcloud.Server, if err != nil { return nodes, err } + + cephVolume, err := hcloud.NewVolume(ctx, fmt.Sprintf("ceph-%s", sn), &hcloud.VolumeArgs{ + Name: pulumi.Sprintf("%s-ceph-vol-0%d", s.Name, i+1), + Size: pulumi.Int(100), + Location: s.Location, + }) + if err != nil { + return nodes, fmt.Errorf("couldn't create volume: %w", err) + } + + _, err = hcloud.NewVolumeAttachment(ctx, fmt.Sprintf("ceph-vol-attach-%s", sn), &hcloud.VolumeAttachmentArgs{ + VolumeId: IDtoIntOutput(cephVolume.ID()), + ServerId: IDtoIntOutput(s.ID()), + }) + if err != nil { + return nodes, fmt.Errorf("couldn't attach volume to node %d", i) + } + nodes = append(nodes, s) nextIp = IncrementIP(net.ParseIP(nextIp)).String() }