training:

dataset = images-damian2
last 2 text encoder layers unfrozen
30 epochs at learning rate 2e-6, then 20 epochs at learning rate 1e-6 (step 240 on blue graph)
cond_dropout = default

first 30 epochs: resume:

sample: