Add NVIDIA GPU support.

Summary of the three file changes below:
  * apps/misc/nvidia.yaml (new): a RuntimeClass named "nvidia" whose handler
    is "nvidia".
  * apps/nvidia.yaml (new): an Argo CD Application "nvidia-device-plugin" that
    installs the nvidia-device-plugin Helm chart at revision 0.13.0 into the
    "nvidia" namespace (created on sync and labelled for the "privileged"
    pod-security level) and sets the chart value runtimeClassName to "nvidia".
  * talconfig.yaml: appends a patch that lists the nvidia, nvidia_uvm,
    nvidia_drm and nvidia_modeset kernel modules and sets the
    net.core.bpf_jit_harden sysctl to 1, and adds two NVIDIA entries to
    officialExtensions.

NOTE(review): this patch was recovered from a copy in which all line breaks
and leading whitespace had been collapsed. The YAML indentation (2 spaces,
indented sequences) is reconstructed from the key structure; confirm it
against the repository, in particular the context lines of talconfig.yaml,
before applying.

diff --git a/apps/misc/nvidia.yaml b/apps/misc/nvidia.yaml
new file mode 100644
index 0000000..c26bd6d
--- /dev/null
+++ b/apps/misc/nvidia.yaml
@@ -0,0 +1,5 @@
+apiVersion: node.k8s.io/v1
+kind: RuntimeClass
+metadata:
+  name: nvidia
+handler: nvidia
diff --git a/apps/nvidia.yaml b/apps/nvidia.yaml
new file mode 100644
index 0000000..14e8454
--- /dev/null
+++ b/apps/nvidia.yaml
@@ -0,0 +1,26 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: nvidia-device-plugin
+  namespace: argocd
+spec:
+  project: default
+  destination:
+    server: https://kubernetes.default.svc
+    namespace: nvidia
+  syncPolicy:
+    automated:
+      prune: true
+      selfHeal: true
+    syncOptions:
+      - CreateNamespace=true
+    managedNamespaceMetadata:
+      labels:
+        pod-security.kubernetes.io/enforce: privileged
+  source:
+    repoURL: https://nvidia.github.io/k8s-device-plugin
+    targetRevision: 0.13.0
+    chart: nvidia-device-plugin
+    helm:
+      valuesObject:
+        runtimeClassName: nvidia
diff --git a/talconfig.yaml b/talconfig.yaml
index d23f229..a0eee5f 100644
--- a/talconfig.yaml
+++ b/talconfig.yaml
@@ -20,6 +20,16 @@ patches:
       kernel:
         modules:
           - name: zfs
+  - |-
+    machine:
+      kernel:
+        modules:
+          - name: nvidia
+          - name: nvidia_uvm
+          - name: nvidia_drm
+          - name: nvidia_modeset
+      sysctls:
+        net.core.bpf_jit_harden: 1
 nodes:
   - hostname: kadan
     ipAddress: kube.sdg.moe
@@ -34,3 +44,5 @@ nodes:
         systemExtensions:
           officialExtensions:
             - siderolabs/zfs
+            - siderolabs/nonfree-kmod-nvidia-lts
+            - siderolabs/nvidia-container-toolkit-lts