File size: 4,573 Bytes
c614b0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# LHM-500M
# Experiment identity: run type, RNG seed, and parent/child naming used for
# checkpoint and log directory layout.
experiment:
    type: lrm
    seed: 42
    parent: video_human_benchmark
    child: human-lrm-500M
model:
    # image encoder
    model_name: SapDinoLRMBHSD3_5
    encoder_type: dinov2_fusion
    encoder_model_name: "dinov2_vitl14_reg"
    encoder_feat_dim: 1024  # dinov2 ViT-L embedding size
    encoder_freeze: false

    # fine (high-res) encoder: Sapiens torchscript checkpoint
    fine_encoder_type: sapiens
    fine_encoder_model_name: "./pretrained_models/sapiens/pretrained/checkpoints/sapiens_1b/sapiens_1b_epoch_173_torchscript.pt2"  # sapiens pretrained model path
    fine_encoder_feat_dim: 1536  # sapiens-1b embedding size 1536
    fine_encoder_freeze: true

    use_face_id: true

    # points embeddings
    # num_pcl: 10240
    latent_query_points_type: "e2e_smplx_sub1"
    pcl_dim: 1024
    facesr: true

    transformer_type: "sd3_mm_bh_cond"  # multi-modal BH attention.
    transformer_heads: 16
    transformer_dim: 1024  # 16 * 64 = 1024
    transformer_layers: 5
    tf_grad_ckpt: true
    encoder_grad_ckpt: true

    # for gs renderer
    human_model_path: "./pretrained_models/human_model_files"
    smplx_subdivide_num: 1
    smplx_type: "smplx_2"
    gs_query_dim: 1024
    gs_use_rgb: true
    gs_sh: 3
    dense_sample_pts: 40000
    gs_mlp_network_config:
        n_neurons: 512
        n_hidden_layers: 2
        activation: silu
    # gs_xyz_offset_max_step: 0.05625  # 1.8 / 32
    # gs_clip_scaling: 0.2  # avoid too large Sphere
    gs_xyz_offset_max_step: 1.0  # 1.8 / 32
    gs_clip_scaling: [100, 0.01, 0.05, 3000]  # [start, start_v, end_v, end]
    expr_param_dim: 100
    shape_param_dim: 10

    fix_opacity: false
    fix_rotation: false
    cano_pose_type: 1  # 0 means exavatar-pose, 1 indicates REC-MV pose

# Dataset: two ClothVideo subsets (real + synthetic), each with its own
# train/val metadata JSON; sampled with equal rate 1.0.
dataset:
    subsets:
        -   name: video_human_flame
            root_dirs: "./train_data/ClothVideo"
            meta_path:
                train: "./train_data/ClothVideo/label/valid_id_with_img_list_clean_30W.json"
                val: "./train_data/ClothVideo/label/valid_id_with_img_list_val.json"
            sample_rate: 1.0
            use_flame: true
            src_head_size: 112
        -   name: video_human_flame_v2
            root_dirs: "./train_data/ClothVideo"
            meta_path:
                train: "./train_data/ClothVideo/label/valid_synthetic_data_train.json"
                val: "./train_data/ClothVideo/label/valid_synthetic_data_val.json"
            sample_rate: 1.0
            use_flame: true
            src_head_size: 112
    sample_side_views: 5
    source_image_res: 1024
    src_head_size: 112
    render_image:
        low: 512
        high: 512
        region: null
    num_train_workers: 4
    multiply: 16  # dino features
    num_val_workers: 2
    pin_mem: true
    repeat_num: 1

train:
    mixed_precision: bf16  # REPLACE THIS BASED ON GPU TYPE
    find_unused_parameters: false
    loss_func:
        pixel_loss: l1  # L1 or MSE
        # per-body-part weights for the ball (asap) loss
        ball_loss:
            type: heuristic  # heuristic ball_loss
            group:
                head: 1.
                lower_body: 100.
                upper_body: 1000.
                hands: 10000.
        # per-body-part weights for the offset (acap) loss
        offset_loss:
            type: classical  # classical offset_loss
            group:
                head: 1.
                lower_body: 1.
                upper_body: 100.
                hands: 1000.
    loss:
        pixel_weight: 0.0
        masked_pixel_weight: 1.0
        masked_head_weight: 0.0
        perceptual_weight: 1.0
        # tv_weight: 5e-4
        tv_weight: -1  # negative presumably disables TV loss — TODO confirm in loss code
        mask_weight: 1.0
        face_id_weight: 0.05
        asap_weight: 10.0  # ball loss
        acap_weight: 1000.0  # offset loss
    optim:
        lr: 4e-5
        weight_decay: 0.05
        beta1: 0.9
        beta2: 0.95
        clip_grad_norm: 0.1  # diffusion model
    scheduler:
        type: cosine
        warmup_real_iters: 0
    batch_size: 4  # REPLACE THIS (PER GPU)
    accum_steps: 1  # REPLACE THIS
    epochs: 60  # REPLACE THIS
    debug_global_steps: null

# Validation settings.
val:
    batch_size: 2
    global_step_period: 1000  # presumably validate every N global steps — confirm against trainer
    debug_batches: 10

# Checkpointing.
saver:
    auto_resume: true
    # NOTE(review): plain `None` is parsed by YAML as the *string* "None",
    # not a null — the loader presumably special-cases that string; write
    # `null` instead if a real YAML null is intended. TODO confirm.
    load_model: None
    checkpoint_root: ./exps/checkpoints
    checkpoint_global_steps: 1000
    checkpoint_keep_level: 60

# Logging and experiment tracking.
logger:
    stream_level: WARNING  # console handler level
    log_level: INFO  # file log level
    log_root: ./exps/logs
    tracker_root: ./exps/trackers
    enable_profiler: false
    trackers:
        - tensorboard
    image_monitor:
        train_global_steps: 100  # presumably log sample images every N steps — confirm
        samples_per_log: 4

# torch.compile / dynamo options — disabled for this run.
compile:
    suppress_errors: true
    print_specializations: true
    disable: true