Publications
Chang, Di; Xu, Hongyi; Xie, You; Gao, Yipeng; Kuang, Zhengfei; Cai, Shengqu; Zhang, Chenxu; Song, Guoxian; Wang, Chao; Shi, Yichun; Chen, Zeyuan; Zhou, Shijie; Luo, Linjie; Wetzstein, Gordon; Soleymani, Mohammad
X-Dyna: Expressive Dynamic Human Image Animation Miscellaneous
2025, (arXiv:2501.10021 [cs]).
@misc{chang_x-dyna_2025,
title = {X-Dyna: Expressive Dynamic Human Image Animation},
author = {Di Chang and Hongyi Xu and You Xie and Yipeng Gao and Zhengfei Kuang and Shengqu Cai and Chenxu Zhang and Guoxian Song and Chao Wang and Yichun Shi and Zeyuan Chen and Shijie Zhou and Linjie Luo and Gordon Wetzstein and Mohammad Soleymani},
url = {http://arxiv.org/abs/2501.10021},
doi = {10.48550/arXiv.2501.10021},
year = {2025},
date = {2025-01-01},
urldate = {2025-02-20},
publisher = {arXiv},
abstract = {We introduce X-Dyna, a novel zero-shot, diffusion-based pipeline for animating a single human image using facial expressions and body movements derived from a driving video, that generates realistic, context-aware dynamics for both the subject and the surrounding environment. Building on prior approaches centered on human pose control, X-Dyna addresses key shortcomings causing the loss of dynamic details, enhancing the lifelike qualities of human video animations. At the core of our approach is the Dynamics-Adapter, a lightweight module that effectively integrates reference appearance context into the spatial attentions of the diffusion backbone while preserving the capacity of motion modules in synthesizing fluid and intricate dynamic details. Beyond body pose control, we connect a local control module with our model to capture identity-disentangled facial expressions, facilitating accurate expression transfer for enhanced realism in animated scenes. Together, these components form a unified framework capable of learning physical human motion and natural scene dynamics from a diverse blend of human and scene videos. Comprehensive qualitative and quantitative evaluations demonstrate that X-Dyna outperforms state-of-the-art methods, creating highly lifelike and expressive animations. The code is available at https://github.com/bytedance/X-Dyna.},
note = {arXiv:2501.10021 [cs]},
keywords = {VGL},
pubstate = {published},
tppubtype = {misc}
}
Hu, Yue; Liu, Rong; Chen, Meida; Beerel, Peter; Feng, Andrew
SplatMAP: Online Dense Monocular SLAM with 3D Gaussian Splatting Miscellaneous
2025, (arXiv:2501.07015 [cs]).
@misc{hu_splatmap_2025,
title = {SplatMAP: Online Dense Monocular SLAM with 3D Gaussian Splatting},
author = {Yue Hu and Rong Liu and Meida Chen and Peter Beerel and Andrew Feng},
url = {http://arxiv.org/abs/2501.07015},
doi = {10.48550/arXiv.2501.07015},
year = {2025},
date = {2025-01-01},
urldate = {2025-01-16},
publisher = {arXiv},
abstract = {Achieving high-fidelity 3D reconstruction from monocular video remains challenging due to the inherent limitations of traditional methods like Structure-from-Motion (SfM) and monocular SLAM in accurately capturing scene details. While differentiable rendering techniques such as Neural Radiance Fields (NeRF) address some of these challenges, their high computational costs make them unsuitable for real-time applications. Additionally, existing 3D Gaussian Splatting (3DGS) methods often focus on photometric consistency, neglecting geometric accuracy and failing to exploit SLAM's dynamic depth and pose updates for scene refinement. We propose a framework integrating dense SLAM with 3DGS for real-time, high-fidelity dense reconstruction. Our approach introduces SLAM-Informed Adaptive Densification, which dynamically updates and densifies the Gaussian model by leveraging dense point clouds from SLAM. Additionally, we incorporate Geometry-Guided Optimization, which combines edge-aware geometric constraints and photometric consistency to jointly optimize the appearance and geometry of the 3DGS scene representation, enabling detailed and accurate SLAM mapping reconstruction. Experiments on the Replica and TUM-RGBD datasets demonstrate the effectiveness of our approach, achieving state-of-the-art results among monocular systems. Specifically, our method achieves a PSNR of 36.864, SSIM of 0.985, and LPIPS of 0.040 on Replica, representing improvements of 10.7%, 6.4%, and 49.4%, respectively, over the previous SOTA. On TUM-RGBD, our method outperforms the closest baseline by 10.2%, 6.6%, and 34.7% in the same metrics. These results highlight the potential of our framework in bridging the gap between photometric and geometric dense 3D scene representations, paving the way for practical and efficient monocular dense reconstruction.},
note = {arXiv:2501.07015 [cs]},
keywords = {VGL},
pubstate = {published},
tppubtype = {misc}
}
Chen, Meida; Han, Kangle; Yu, Zifan; Feng, Andrew; Hou, Yu; You, Suya; Soibelman, Lucio
An Aerial Photogrammetry Benchmark Dataset for Point Cloud Segmentation and Style Translation Journal Article
In: Remote Sensing, vol. 16, no. 22, pp. 4240, 2024, ISSN: 2072-4292.
@article{chen_aerial_2024,
title = {An Aerial Photogrammetry Benchmark Dataset for Point Cloud Segmentation and Style Translation},
author = {Meida Chen and Kangle Han and Zifan Yu and Andrew Feng and Yu Hou and Suya You and Lucio Soibelman},
url = {https://www.mdpi.com/2072-4292/16/22/4240},
doi = {10.3390/rs16224240},
issn = {2072-4292},
year = {2024},
date = {2024-11-01},
urldate = {2024-12-05},
journal = {Remote Sensing},
volume = {16},
number = {22},
pages = {4240},
abstract = {The recent surge in diverse 3D datasets spanning various scales and applications marks a significant advancement in the field. However, the comprehensive process of data acquisition, refinement, and annotation at a large scale poses a formidable challenge, particularly for individual researchers and small teams. To this end, we present a novel synthetic 3D point cloud generation framework that can produce detailed outdoor aerial photogrammetric 3D datasets with accurate ground truth annotations without the labor-intensive and time-consuming data collection/annotation processes. Our pipeline procedurally generates synthetic environments, mirroring real-world data collection and 3D reconstruction processes. A key feature of our framework is its ability to replicate consistent quality, noise patterns, and diversity similar to real-world datasets. This is achieved by adopting UAV flight patterns that resemble those used in real-world data collection processes (e.g., the cross-hatch flight pattern) across various synthetic terrains that are procedurally generated, thereby ensuring data consistency akin to real-world scenarios. Moreover, the generated datasets are enriched with precise semantic and instance annotations, eliminating the need for manual labeling. Our approach has led to the development and release of the Semantic Terrain Points Labeling—Synthetic 3D (STPLS3D) benchmark, an extensive outdoor 3D dataset encompassing over 16 km2, featuring up to 19 semantic labels. We also collected, reconstructed, and annotated four real-world datasets for validation purposes. Extensive experiments on these datasets demonstrate our synthetic datasets’ effectiveness, superior quality, and their value as a benchmark dataset for further point cloud research.},
keywords = {DTIC, VGL},
pubstate = {published},
tppubtype = {article}
}
Xiao, Hanyuan; Chen, Yingshu; Huang, Huajian; Xiong, Haolin; Yang, Jing; Prasad, Pratusha; Zhao, Yajie
Localized Gaussian Splatting Editing with Contextual Awareness Miscellaneous
2024, (arXiv:2408.00083 [cs]).
@misc{xiao_localized_2024,
title = {Localized Gaussian Splatting Editing with Contextual Awareness},
author = {Hanyuan Xiao and Yingshu Chen and Huajian Huang and Haolin Xiong and Jing Yang and Pratusha Prasad and Yajie Zhao},
url = {http://arxiv.org/abs/2408.00083},
year = {2024},
date = {2024-07-01},
urldate = {2024-08-16},
publisher = {arXiv},
abstract = {Recent text-guided generation of individual 3D object has achieved great success using diffusion priors. However, these methods are not suitable for object insertion and replacement tasks as they do not consider the background, leading to illumination mismatches within the environment. To bridge the gap, we introduce an illumination-aware 3D scene editing pipeline for 3D Gaussian Splatting (3DGS) representation. Our key observation is that inpainting by the state-of-the-art conditional 2D diffusion model is consistent with background in lighting. To leverage the prior knowledge from the well-trained diffusion models for 3D object generation, our approach employs a coarse-to-fine objection optimization pipeline with inpainted views. In the first coarse step, we achieve image-to-3D lifting given an ideal inpainted view. The process employs 3D-aware diffusion prior from a view-conditioned diffusion model, which preserves illumination present in the conditioning image. To acquire an ideal inpainted image, we introduce an Anchor View Proposal (AVP) algorithm to find a single view that best represents the scene illumination in target region. In the second Texture Enhancement step, we introduce a novel Depth-guided Inpainting Score Distillation Sampling (DI-SDS), which enhances geometry and texture details with the inpainting diffusion prior, beyond the scope of the 3D-aware diffusion prior knowledge in the first coarse step. DI-SDS not only provides fine-grained texture enhancement, but also urges optimization to respect scene lighting. Our approach efficiently achieves local editing with global illumination consistency without explicitly modeling light transport. We demonstrate robustness of our method by evaluating editing in real scenes containing explicit highlight and shadows, and compare against the state-of-the-art text-to-3D editing methods.},
note = {arXiv:2408.00083 [cs]},
keywords = {DTIC, VGL},
pubstate = {published},
tppubtype = {misc}
}
Chen, Meida; Lal, Devashish; Yu, Zifan; Xu, Jiuyi; Feng, Andrew; You, Suya; Nurunnabi, Abdul; Shi, Yangming
Large-Scale 3D Terrain Reconstruction Using 3D Gaussian Splatting for Visualization and Simulation Journal Article
In: Int. Arch. Photogramm. Remote Sens. Spatial Inf. Sci., vol. XLVIII-2-2024, pp. 49–54, 2024, ISSN: 2194-9034.
@article{chen_large-scale_2024,
title = {Large-Scale 3D Terrain Reconstruction Using 3D Gaussian Splatting for Visualization and Simulation},
author = {Meida Chen and Devashish Lal and Zifan Yu and Jiuyi Xu and Andrew Feng and Suya You and Abdul Nurunnabi and Yangming Shi},
url = {https://isprs-archives.copernicus.org/articles/XLVIII-2-2024/49/2024/},
doi = {10.5194/isprs-archives-XLVIII-2-2024-49-2024},
issn = {2194-9034},
year = {2024},
date = {2024-06-01},
urldate = {2024-06-20},
journal = {Int. Arch. Photogramm. Remote Sens. Spatial Inf. Sci.},
volume = {XLVIII-2-2024},
pages = {49–54},
abstract = {The fusion of low-cost unmanned aerial systems (UAS) with advanced photogrammetric techniques has revolutionized 3D terrain reconstruction, enabling the automated creation of detailed models. Concurrently, the advent of 3D Gaussian Splatting has introduced a paradigm shift in 3D data representation, offering visually realistic renditions distinct from traditional polygon-based models. Our research builds upon this foundation, aiming to integrate Gaussian Splatting into interactive simulations for immersive virtual environments. We address challenges such as collision detection by adopting a hybrid approach, combining Gaussian Splatting with photogrammetry-derived meshes. Through comprehensive experimentation covering varying terrain sizes and Gaussian densities, we evaluate scalability, performance, and limitations. Our findings contribute to advancing the use of advanced computer graphics techniques for enhanced 3D terrain visualization and simulation.},
keywords = {DTIC, Graphics, VGL},
pubstate = {published},
tppubtype = {article}
}
Zhang, Mingyuan; Cai, Zhongang; Pan, Liang; Hong, Fangzhou; Guo, Xinying; Yang, Lei; Liu, Ziwei
MotionDiffuse: Text-Driven Human Motion Generation With Diffusion Model Journal Article
In: IEEE Trans. Pattern Anal. Mach. Intell., vol. 46, no. 6, pp. 4115–4128, 2024, ISSN: 0162-8828, 2160-9292, 1939-3539.
@article{zhang_motiondiffuse_2024,
title = {MotionDiffuse: Text-Driven Human Motion Generation With Diffusion Model},
author = {Mingyuan Zhang and Zhongang Cai and Liang Pan and Fangzhou Hong and Xinying Guo and Lei Yang and Ziwei Liu},
url = {https://ieeexplore.ieee.org/document/10416192/},
doi = {10.1109/TPAMI.2024.3355414},
issn = {0162-8828, 2160-9292, 1939-3539},
year = {2024},
date = {2024-06-01},
urldate = {2024-07-18},
journal = {IEEE Trans. Pattern Anal. Mach. Intell.},
volume = {46},
number = {6},
pages = {4115–4128},
keywords = {VGL},
pubstate = {published},
tppubtype = {article}
}
Nurunnabi, Abdul; Teferle, Felicia; Laefer, Debra F.; Chen, Meida; Ali, Mir Masoom
Development of a Precise Tree Structure from LiDAR Point Clouds Journal Article
In: Int. Arch. Photogramm. Remote Sens. Spatial Inf. Sci., vol. XLVIII-2-2024, pp. 301–308, 2024, ISSN: 2194-9034.
@article{nurunnabi_development_2024,
title = {Development of a Precise Tree Structure from LiDAR Point Clouds},
author = {Abdul Nurunnabi and Felicia Teferle and Debra F. Laefer and Meida Chen and Mir Masoom Ali},
url = {https://isprs-archives.copernicus.org/articles/XLVIII-2-2024/301/2024/},
doi = {10.5194/isprs-archives-XLVIII-2-2024-301-2024},
issn = {2194-9034},
year = {2024},
date = {2024-06-01},
urldate = {2024-07-11},
journal = {Int. Arch. Photogramm. Remote Sens. Spatial Inf. Sci.},
volume = {XLVIII-2-2024},
pages = {301–308},
abstract = {A precise tree structure that represents the distribution of tree stem, branches, and leaves is crucial for accurately capturing the full representation of a tree. Light Detection and Ranging (LiDAR)-based three-dimensional (3D) point clouds (PCs) capture the geometry of scanned objects including forest stands and individual trees. PCs are irregular, unstructured, often noisy, and contaminated by outliers. Researchers have struggled to develop methods to separate leaves and wood without losing the tree geometry. This paper proposes a solution that employs only the spatial coordinates (x, y, z) of the PC. The new algorithm works as a filtering approach, utilizing multi-scale neighborhood-based geometric features (GFs), e.g., linearity, planarity, and verticality, to classify linear (wood) and non-linear (leaf) points. This involves finding potential wood points and coupling them with an octree-based segmentation to develop a tree architecture. The main contributions of this paper are (i) investigating the potential of different GFs to split linear and non-linear points, (ii) introducing a novel method that pointwise classifies leaf and wood points, and (iii) developing a precise 3D tree structure. The performance of the new algorithm has been demonstrated through terrestrial laser scanning PCs. For a Scots pine tree, the new method classifies leaf and wood points with an overall accuracy of 97.9%.},
keywords = {Narrative, VGL},
pubstate = {published},
tppubtype = {article}
}
Zhang, Hao; Chang, Di; Li, Fang; Soleymani, Mohammad; Ahuja, Narendra
MagicPose4D: Crafting Articulated Models with Appearance and Motion Control Miscellaneous
2024, (Version Number: 1).
@misc{zhang_magicpose4d_2024,
title = {MagicPose4D: Crafting Articulated Models with Appearance and Motion Control},
author = {Hao Zhang and Di Chang and Fang Li and Mohammad Soleymani and Narendra Ahuja},
url = {https://arxiv.org/abs/2405.14017},
doi = {10.48550/ARXIV.2405.14017},
year = {2024},
date = {2024-05-01},
urldate = {2024-06-25},
publisher = {arXiv},
abstract = {With the success of 2D and 3D visual generative models, there is growing interest in generating 4D content. Existing methods primarily rely on text prompts to produce 4D content, but they often fall short of accurately defining complex or rare motions. To address this limitation, we propose MagicPose4D, a novel framework for refined control over both appearance and motion in 4D generation. Unlike traditional methods, MagicPose4D accepts monocular videos as motion prompts, enabling precise and customizable motion generation. MagicPose4D comprises two key modules:
i) Dual-Phase 4D Reconstruction Module, which operates in two phases. The first phase focuses on capturing the model's shape using accurate 2D supervision and less accurate but geometrically informative 3D pseudo-supervision without imposing skeleton constraints. The second phase refines the model using more accurate pseudo-3D supervision obtained in the first phase, and introduces kinematic chain-based skeleton constraints to ensure physical plausibility. Additionally, we propose a Global-local Chamfer loss that aligns the overall distribution of predicted mesh vertices with the supervision while maintaining part-level alignment without extra annotations.
ii) Cross-category Motion Transfer Module, which leverages the predictions from the 4D reconstruction module and uses a kinematic-chain-based skeleton to achieve cross-category motion transfer. It ensures smooth transitions between frames through dynamic rigidity, facilitating robust generalization without additional training.
Through extensive experiments, we demonstrate that MagicPose4D significantly improves the accuracy and consistency of 4D content generation, outperforming existing methods in various benchmarks.},
note = {Version Number: 1},
keywords = {VGL, Virtual Humans},
pubstate = {published},
tppubtype = {misc}
}
Liu, Rong; Xu, Rui; Hu, Yue; Chen, Meida; Feng, Andrew
AtomGS: Atomizing Gaussian Splatting for High-Fidelity Radiance Field Miscellaneous
2024, (Version Number: 2).
@misc{liu_atomgs_2024,
title = {AtomGS: Atomizing Gaussian Splatting for High-Fidelity Radiance Field},
author = {Rong Liu and Rui Xu and Yue Hu and Meida Chen and Andrew Feng},
url = {https://arxiv.org/abs/2405.12369},
doi = {10.48550/ARXIV.2405.12369},
year = {2024},
date = {2024-05-01},
urldate = {2024-07-11},
publisher = {arXiv},
abstract = {3D Gaussian Splatting (3DGS) has recently advanced radiance field reconstruction by offering superior capabilities for novel view synthesis and real-time rendering speed. However, its strategy of blending optimization and adaptive density control might lead to sub-optimal results; it can sometimes yield noisy geometry and blurry artifacts due to prioritizing optimizing large Gaussians at the cost of adequately densifying smaller ones. To address this, we introduce AtomGS, consisting of Atomized Proliferation and Geometry-Guided Optimization. The Atomized Proliferation constrains ellipsoid Gaussians of various sizes into more uniform-sized Atom Gaussians. The strategy enhances the representation of areas with fine features by placing greater emphasis on densification in accordance with scene details. In addition, we proposed a Geometry-Guided Optimization approach that incorporates an Edge-Aware Normal Loss. This optimization method effectively smooths flat surfaces while preserving intricate details. Our evaluation shows that AtomGS outperforms existing state-of-the-art methods in rendering quality. Additionally, it achieves competitive accuracy in geometry reconstruction and offers a significant improvement in training speed over other SDF-based methods. More interactive demos can be found in our website (https://rongliu-leo.github.io/AtomGS/).},
note = {Version Number: 2},
keywords = {Graphics, VGL},
pubstate = {published},
tppubtype = {misc}
}
Zhang, Hui; Kuang, Bingran; Zhao, Yajie
Camera Calibration using a Single View of a Symmetric Object Proceedings Article
In: ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2705–2709, IEEE, Seoul, Korea, Republic of, 2024, ISBN: 979-8-3503-4485-1.
@inproceedings{zhang_camera_2024,
title = {Camera Calibration using a Single View of a Symmetric Object},
author = {Hui Zhang and Bingran Kuang and Yajie Zhao},
url = {https://ieeexplore.ieee.org/document/10446005/},
doi = {10.1109/ICASSP48485.2024.10446005},
isbn = {979-8-3503-4485-1},
year = {2024},
date = {2024-04-01},
urldate = {2024-06-25},
booktitle = {ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages = {2705–2709},
publisher = {IEEE},
address = {Seoul, Korea, Republic of},
keywords = {Graphics, VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
Chen, Haiwei; Zhao, Yajie
Don't Look into the Dark: Latent Codes for Pluralistic Image Inpainting Miscellaneous
2024, (arXiv:2403.18186 [cs]).
@misc{chen_dont_2024,
title = {Don't Look into the Dark: Latent Codes for Pluralistic Image Inpainting},
author = {Haiwei Chen and Yajie Zhao},
url = {http://arxiv.org/abs/2403.18186},
year = {2024},
date = {2024-03-01},
urldate = {2024-08-15},
publisher = {arXiv},
abstract = {We present a method for large-mask pluralistic image inpainting based on the generative framework of discrete latent codes. Our method learns latent priors, discretized as tokens, by only performing computations at the visible locations of the image. This is realized by a restrictive partial encoder that predicts the token label for each visible block, a bidirectional transformer that infers the missing labels by only looking at these tokens, and a dedicated synthesis network that couples the tokens with the partial image priors to generate coherent and pluralistic complete image even under extreme mask settings. Experiments on public benchmarks validate our design choices as the proposed method outperforms strong baselines in both visual quality and diversity metrics.},
note = {arXiv:2403.18186 [cs]},
keywords = {VGL},
pubstate = {published},
tppubtype = {misc}
}
Yang, Jing; Xiao, Hanyuan; Teng, Wenbin; Cai, Yunxuan; Zhao, Yajie
Light Sampling Field and BRDF Representation for Physically-based Neural Rendering Journal Article
In: 2023, (Publisher: arXiv, Version Number: 1).
@article{yang_light_2023,
title = {Light Sampling Field and BRDF Representation for Physically-based Neural Rendering},
author = {Jing Yang and Hanyuan Xiao and Wenbin Teng and Yunxuan Cai and Yajie Zhao},
url = {https://arxiv.org/abs/2304.05472},
doi = {10.48550/ARXIV.2304.05472},
year = {2023},
date = {2023-04-01},
urldate = {2023-08-22},
abstract = {Physically-based rendering (PBR) is key for immersive rendering effects used widely in the industry to showcase detailed realistic scenes from computer graphics assets. A well-known caveat is that producing the same is computationally heavy and relies on complex capture devices. Inspired by the success in quality and efficiency of recent volumetric neural rendering, we want to develop a physically-based neural shader to eliminate device dependency and significantly boost performance. However, no existing lighting and material models in the current neural rendering approaches can accurately represent the comprehensive lighting models and BRDFs properties required by the PBR process. Thus, this paper proposes a novel lighting representation that models direct and indirect light locally through a light sampling strategy in a learned light sampling field. We also propose BRDF models to separately represent surface/subsurface scattering details to enable complex objects such as translucent material (i.e., skin, jade). We then implement our proposed representations with an end-to-end physically-based neural face skin shader, which takes a standard face asset (i.e., geometry, albedo map, and normal map) and an HDRI for illumination as inputs and generates a photo-realistic rendering as output. Extensive experiments showcase the quality and efficiency of our PBR face skin shader, indicating the effectiveness of our proposed lighting and material representations.},
note = {Publisher: arXiv, Version Number: 1},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Chemburkar, Ankur; Lu, Shuhong; Feng, Andrew
MoDDM: Text-to-Motion Synthesis using Discrete Diffusion Model Proceedings Article
In: British Machine Vision Conference (BMVC), 2023.
@inproceedings{chemburkar_moddm_2023,
title = {MoDDM: Text-to-Motion Synthesis using Discrete Diffusion Model},
author = {Ankur Chemburkar and Shuhong Lu and Andrew Feng},
url = {https://papers.bmvc2023.org/0624.pdf},
year = {2023},
date = {2023-01-01},
abstract = {We present the motion discrete diffusion model (MoDDM) for synthesizing human motion from text descriptions that addresses challenges in cross-modal mapping and motion diversity. The previous methods that utilized variational autoencoder (VAE) to learn the latent distributions for text-to-motion synthesis tend to produce motions with less diversity and fidelity. While the diffusion models show promising results by generating high quality motions, they require higher computational costs and may produce motions less aligned with the input text. The proposed method combines the discrete latent space and diffusion models to learn an expressive conditional probabilistic mapping for motion synthesis. Our method utilizes vector quantization variational autoencoder (VQ-VAE) to learn discrete motion tokens and then applies discrete denoising diffusion probabilistic models (D3PM) to learn the conditional probability distributions for the motion tokens. The discrete classifier-free guidance is further utilized in the training process with proper guidance scale for aligning the motions and the corresponding text descriptions. By learning the denoising model in the discrete latent space, the method produces high quality motion results while greatly reducing computational costs compared to training the diffusion models on raw motion sequences. The evaluation results show that the proposed approach outperforms previous methods in both motion quality and text-to-motion matching accuracy.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Liu, Shichen; Cai, Yunxuan; Chen, Haiwei; Zhou, Yichao; Zhao, Yajie
Rapid Face Asset Acquisition with Recurrent Feature Alignment Journal Article
In: ACM Trans. Graph., vol. 41, no. 6, pp. 214:1–214:17, 2022, ISSN: 0730-0301.
@article{liu_rapid_2022,
title = {Rapid Face Asset Acquisition with Recurrent Feature Alignment},
author = {Shichen Liu and Yunxuan Cai and Haiwei Chen and Yichao Zhou and Yajie Zhao},
url = {https://dl.acm.org/doi/10.1145/3550454.3555509},
doi = {10.1145/3550454.3555509},
issn = {0730-0301},
year = {2022},
date = {2022-11-01},
urldate = {2023-03-31},
journal = {ACM Trans. Graph.},
volume = {41},
number = {6},
pages = {214:1–214:17},
abstract = {We present Recurrent Feature Alignment (ReFA), an end-to-end neural network for the very rapid creation of production-grade face assets from multi-view images. ReFA is on par with the industrial pipelines in quality for producing accurate, complete, registered, and textured assets directly applicable to physically-based rendering, but produces the asset end-to-end, fully automatically at a significantly faster speed at 4.5 FPS, which is unprecedented among neural-based techniques. Our method represents face geometry as a position map in the UV space. The network first extracts per-pixel features in both the multi-view image space and the UV space. A recurrent module then iteratively optimizes the geometry by projecting the image-space features to the UV space and comparing them with a reference UV-space feature. The optimized geometry then provides pixel-aligned signals for the inference of high-resolution textures. Experiments have validated that ReFA achieves a median error of 0.603mm in geometry reconstruction, is robust to extreme pose and expression, and excels in sparse-view settings. We believe that the progress achieved by our network enables lightweight, fast face assets acquisition that significantly boosts the downstream applications, such as avatar creation and facial performance capture. It will also enable massive database capturing for deep learning purposes.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Kuang, Zhengfei; Li, Jiaman; He, Mingming; Wang, Tong; Zhao, Yajie
DenseGAP: Graph-Structured Dense Correspondence Learning with Anchor Points Proceedings Article
In: International Conference on Pattern Recognition (ICPR), pp. 542–549, IEEE Computer Society, 2022, ISBN: 978-1-6654-9062-7.
@inproceedings{kuang_densegap_2022,
title = {DenseGAP: Graph-Structured Dense Correspondence Learning with Anchor Points},
author = {Zhengfei Kuang and Jiaman Li and Mingming He and Tong Wang and Yajie Zhao},
url = {https://www.computer.org/csdl/proceedings-article/icpr/2022/09956472/1IHpppIuqOc},
doi = {10.1109/ICPR56361.2022.9956472},
isbn = {978-1-6654-9062-7},
year = {2022},
date = {2022-08-01},
urldate = {2023-03-31},
pages = {542–549},
publisher = {IEEE Computer Society},
abstract = {Establishing dense correspondence between two images is a fundamental computer vision problem, which is typically tackled by matching local feature descriptors. However, without global awareness, such local features are often insufficient for disambiguating similar regions. And computing the pairwise feature correlation across images is both computation-expensive and memory-intensive. To make the local features aware of the global context and improve their matching accuracy, we introduce DenseGAP, a new solution for efficient Dense correspondence learning with a Graph-structured neural network conditioned on Anchor Points. Specifically, we first propose a graph structure that utilizes anchor points to provide sparse but reliable prior on inter- and intra-image context and propagates them to all image points via directed edges. We also design a graph-structured network to broadcast multi-level contexts via light-weighted message-passing layers and generate high-resolution feature maps at low memory cost. Finally, based on the predicted feature maps, we introduce a coarse-to-fine framework for accurate correspondence prediction using cycle consistency. Our feature descriptors capture both local and global information, thus enabling a continuous feature field for querying arbitrary points at high resolution. Through comprehensive ablative experiments and evaluations on large-scale indoor and outdoor datasets, we demonstrate that our method advances the state-of-the-art of correspondence learning on most benchmarks.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Chen, Haiwei; Liu, Jiayi; Chen, Weikai; Liu, Shichen; Zhao, Yajie
Exemplar-based Pattern Synthesis with Implicit Periodic Field Network Proceedings Article
In: 2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3698–3707, IEEE, New Orleans, LA, USA, 2022, ISBN: 978-1-6654-6946-3.
@inproceedings{chen_exemplar-based_2022,
title = {Exemplar-based Pattern Synthesis with Implicit Periodic Field Network},
author = {Haiwei Chen and Jiayi Liu and Weikai Chen and Shichen Liu and Yajie Zhao},
url = {https://ieeexplore.ieee.org/document/9879904/},
doi = {10.1109/CVPR52688.2022.00369},
isbn = {978-1-6654-6946-3},
year = {2022},
date = {2022-06-01},
urldate = {2023-02-10},
booktitle = {2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
pages = {3698–3707},
publisher = {IEEE},
address = {New Orleans, LA, USA},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Danieau, Fabien; Guillotel, Philippe; Hoyet, Ludovic; Tonneau, Steve; Zhao, Yajie
Editorial: Creating Lifelike Digital Humans Journal Article
In: Front. Virtual Real., vol. 3, pp. 906118, 2022, ISSN: 2673-4192.
@article{danieau_editorial_2022,
title = {Editorial: Creating Lifelike Digital Humans},
author = {Fabien Danieau and Philippe Guillotel and Ludovic Hoyet and Steve Tonneau and Yajie Zhao},
url = {https://www.frontiersin.org/articles/10.3389/frvir.2022.906118/full},
doi = {10.3389/frvir.2022.906118},
issn = {2673-4192},
year = {2022},
date = {2022-04-01},
urldate = {2024-08-13},
journal = {Front. Virtual Real.},
volume = {3},
pages = {906118},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Liu, Shichen; Li, Tianye; Chen, Weikai; Li, Hao
A General Differentiable Mesh Renderer for Image-Based 3D Reasoning Journal Article
In: IEEE Transactions on Pattern Analysis and Machine Intelligence, vol. 44, no. 1, pp. 50–62, 2022, ISSN: 1939-3539, (Conference Name: IEEE Transactions on Pattern Analysis and Machine Intelligence).
@article{liu_general_2022,
title = {A General Differentiable Mesh Renderer for Image-Based 3D Reasoning},
author = {Shichen Liu and Tianye Li and Weikai Chen and Hao Li},
doi = {10.1109/TPAMI.2020.3007759},
issn = {1939-3539},
year = {2022},
date = {2022-01-01},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
volume = {44},
number = {1},
pages = {50–62},
abstract = {Rendering bridges the gap between 2D vision and 3D scenes by simulating the physical process of image formation. By inverting such renderer, one can think of a learning approach to infer 3D information from 2D images. However, standard graphics renderers involve a fundamental step called rasterization, which prevents rendering to be differentiable. Unlike the state-of-the-art differentiable renderers (Kato et al. 2018 and Loper 2018), which only approximate the rendering gradient in the backpropagation, we propose a natually differentiable rendering framework that is able to (1) directly render colorized mesh using differentiable functions and (2) back-propagate efficient supervisions to mesh vertices and their attributes from various forms of image representations. The key to our framework is a novel formulation that views rendering as an aggregation function that fuses the probabilistic contributions of all mesh triangles with respect to the rendered pixels. Such formulation enables our framework to flow gradients to the occluded and distant vertices, which cannot be achieved by the previous state-of-the-arts. We show that by using the proposed renderer, one can achieve significant improvement in 3D unsupervised single-view reconstruction both qualitatively and quantitatively. Experiments also demonstrate that our approach can handle the challenging tasks in image-based shape fitting, which remain nontrivial to existing differentiable renders.},
note = {Conference Name: IEEE Transactions on Pattern Analysis and Machine Intelligence},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Li, Jiaman; Villegas, Ruben; Ceylan, Duygu; Yang, Jimei; Kuang, Zhengfei; Li, Hao; Zhao, Yajie
Task-Generic Hierarchical Human Motion Prior using VAEs Proceedings Article
In: 2021 International Conference on 3D Vision (3DV), pp. 771–781, IEEE, London, United Kingdom, 2021, ISBN: 978-1-6654-2688-6.
@inproceedings{li_task-generic_2021,
title = {Task-Generic Hierarchical Human Motion Prior using VAEs},
author = {Jiaman Li and Ruben Villegas and Duygu Ceylan and Jimei Yang and Zhengfei Kuang and Hao Li and Yajie Zhao},
url = {https://ieeexplore.ieee.org/document/9665881/},
doi = {10.1109/3DV53792.2021.00086},
isbn = {978-1-6654-2688-6},
year = {2021},
date = {2021-12-01},
urldate = {2022-09-22},
booktitle = {2021 International Conference on 3D Vision (3DV)},
pages = {771–781},
publisher = {IEEE},
address = {London, United Kingdom},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Liu, Shichen; Zhou, Yichao; Zhao, Yajie
VaPiD: A Rapid Vanishing Point Detector via Learned Optimizers Proceedings Article
In: 2021 IEEE/CVF International Conference on Computer Vision (ICCV), pp. 12839–12848, IEEE, Montreal, QC, Canada, 2021, ISBN: 978-1-6654-2812-5.
@inproceedings{liu_vapid_2021,
title = {VaPiD: A Rapid Vanishing Point Detector via Learned Optimizers},
author = {Shichen Liu and Yichao Zhou and Yajie Zhao},
url = {https://ieeexplore.ieee.org/document/9711313/},
doi = {10.1109/ICCV48922.2021.01262},
isbn = {978-1-6654-2812-5},
year = {2021},
date = {2021-10-01},
urldate = {2022-09-22},
booktitle = {2021 IEEE/CVF International Conference on Computer Vision (ICCV)},
pages = {12839–12848},
publisher = {IEEE},
address = {Montreal, QC, Canada},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Filter
2025
Chang, Di; Xu, Hongyi; Xie, You; Gao, Yipeng; Kuang, Zhengfei; Cai, Shengqu; Zhang, Chenxu; Song, Guoxian; Wang, Chao; Shi, Yichun; Chen, Zeyuan; Zhou, Shijie; Luo, Linjie; Wetzstein, Gordon; Soleymani, Mohammad
X-Dyna: Expressive Dynamic Human Image Animation Miscellaneous
2025, (arXiv:2501.10021 [cs]).
Abstract | Links | BibTeX | Tags: VGL
@misc{chang_x-dyna_2025,
title = {X-Dyna: Expressive Dynamic Human Image Animation},
author = {Di Chang and Hongyi Xu and You Xie and Yipeng Gao and Zhengfei Kuang and Shengqu Cai and Chenxu Zhang and Guoxian Song and Chao Wang and Yichun Shi and Zeyuan Chen and Shijie Zhou and Linjie Luo and Gordon Wetzstein and Mohammad Soleymani},
url = {http://arxiv.org/abs/2501.10021},
doi = {10.48550/arXiv.2501.10021},
year = {2025},
date = {2025-01-01},
urldate = {2025-02-20},
publisher = {arXiv},
abstract = {We introduce X-Dyna, a novel zero-shot, diffusion-based pipeline for animating a single human image using facial expressions and body movements derived from a driving video, that generates realistic, context-aware dynamics for both the subject and the surrounding environment. Building on prior approaches centered on human pose control, X-Dyna addresses key shortcomings causing the loss of dynamic details, enhancing the lifelike qualities of human video animations. At the core of our approach is the Dynamics-Adapter, a lightweight module that effectively integrates reference appearance context into the spatial attentions of the diffusion backbone while preserving the capacity of motion modules in synthesizing fluid and intricate dynamic details. Beyond body pose control, we connect a local control module with our model to capture identity-disentangled facial expressions, facilitating accurate expression transfer for enhanced realism in animated scenes. Together, these components form a unified framework capable of learning physical human motion and natural scene dynamics from a diverse blend of human and scene videos. Comprehensive qualitative and quantitative evaluations demonstrate that X-Dyna outperforms state-of-the-art methods, creating highly lifelike and expressive animations. The code is available at https://github.com/bytedance/X-Dyna.},
note = {arXiv:2501.10021 [cs]},
keywords = {VGL},
pubstate = {published},
tppubtype = {misc}
}
Hu, Yue; Liu, Rong; Chen, Meida; Beerel, Peter; Feng, Andrew
SplatMAP: Online Dense Monocular SLAM with 3D Gaussian Splatting Miscellaneous
2025, (arXiv:2501.07015 [cs]).
Abstract | Links | BibTeX | Tags: VGL
@misc{hu_splatmap_2025,
title = {SplatMAP: Online Dense Monocular SLAM with 3D Gaussian Splatting},
author = {Yue Hu and Rong Liu and Meida Chen and Peter Beerel and Andrew Feng},
url = {http://arxiv.org/abs/2501.07015},
doi = {10.48550/arXiv.2501.07015},
year = {2025},
date = {2025-01-01},
urldate = {2025-01-16},
publisher = {arXiv},
abstract = {Achieving high-fidelity 3D reconstruction from monocular video remains challenging due to the inherent limitations of traditional methods like Structure-from-Motion (SfM) and monocular SLAM in accurately capturing scene details. While differentiable rendering techniques such as Neural Radiance Fields (NeRF) address some of these challenges, their high computational costs make them unsuitable for real-time applications. Additionally, existing 3D Gaussian Splatting (3DGS) methods often focus on photometric consistency, neglecting geometric accuracy and failing to exploit SLAM's dynamic depth and pose updates for scene refinement. We propose a framework integrating dense SLAM with 3DGS for real-time, high-fidelity dense reconstruction. Our approach introduces SLAM-Informed Adaptive Densification, which dynamically updates and densifies the Gaussian model by leveraging dense point clouds from SLAM. Additionally, we incorporate Geometry-Guided Optimization, which combines edge-aware geometric constraints and photometric consistency to jointly optimize the appearance and geometry of the 3DGS scene representation, enabling detailed and accurate SLAM mapping reconstruction. Experiments on the Replica and TUM-RGBD datasets demonstrate the effectiveness of our approach, achieving state-of-the-art results among monocular systems. Specifically, our method achieves a PSNR of 36.864, SSIM of 0.985, and LPIPS of 0.040 on Replica, representing improvements of 10.7%, 6.4%, and 49.4%, respectively, over the previous SOTA. On TUM-RGBD, our method outperforms the closest baseline by 10.2%, 6.6%, and 34.7% in the same metrics. These results highlight the potential of our framework in bridging the gap between photometric and geometric dense 3D scene representations, paving the way for practical and efficient monocular dense reconstruction.},
note = {arXiv:2501.07015 [cs]},
keywords = {VGL},
pubstate = {published},
tppubtype = {misc}
}
2024
Chen, Meida; Han, Kangle; Yu, Zifan; Feng, Andrew; Hou, Yu; You, Suya; Soibelman, Lucio
An Aerial Photogrammetry Benchmark Dataset for Point Cloud Segmentation and Style Translation Journal Article
In: Remote Sensing, vol. 16, no. 22, pp. 4240, 2024, ISSN: 2072-4292.
Abstract | Links | BibTeX | Tags: DTIC, VGL
@article{chen_aerial_2024,
title = {An Aerial Photogrammetry Benchmark Dataset for Point Cloud Segmentation and Style Translation},
author = {Meida Chen and Kangle Han and Zifan Yu and Andrew Feng and Yu Hou and Suya You and Lucio Soibelman},
url = {https://www.mdpi.com/2072-4292/16/22/4240},
doi = {10.3390/rs16224240},
issn = {2072-4292},
year = {2024},
date = {2024-11-01},
urldate = {2024-12-05},
journal = {Remote Sensing},
volume = {16},
number = {22},
pages = {4240},
abstract = {The recent surge in diverse 3D datasets spanning various scales and applications marks a significant advancement in the field. However, the comprehensive process of data acquisition, refinement, and annotation at a large scale poses a formidable challenge, particularly for individual researchers and small teams. To this end, we present a novel synthetic 3D point cloud generation framework that can produce detailed outdoor aerial photogrammetric 3D datasets with accurate ground truth annotations without the labor-intensive and time-consuming data collection/annotation processes. Our pipeline procedurally generates synthetic environments, mirroring real-world data collection and 3D reconstruction processes. A key feature of our framework is its ability to replicate consistent quality, noise patterns, and diversity similar to real-world datasets. This is achieved by adopting UAV flight patterns that resemble those used in real-world data collection processes (e.g., the cross-hatch flight pattern) across various synthetic terrains that are procedurally generated, thereby ensuring data consistency akin to real-world scenarios. Moreover, the generated datasets are enriched with precise semantic and instance annotations, eliminating the need for manual labeling. Our approach has led to the development and release of the Semantic Terrain Points Labeling—Synthetic 3D (STPLS3D) benchmark, an extensive outdoor 3D dataset encompassing over 16 km2, featuring up to 19 semantic labels. We also collected, reconstructed, and annotated four real-world datasets for validation purposes. Extensive experiments on these datasets demonstrate our synthetic datasets’ effectiveness, superior quality, and their value as a benchmark dataset for further point cloud research.},
keywords = {DTIC, VGL},
pubstate = {published},
tppubtype = {article}
}
Xiao, Hanyuan; Chen, Yingshu; Huang, Huajian; Xiong, Haolin; Yang, Jing; Prasad, Pratusha; Zhao, Yajie
Localized Gaussian Splatting Editing with Contextual Awareness Miscellaneous
2024, (arXiv:2408.00083 [cs]).
Abstract | Links | BibTeX | Tags: DTIC, VGL
@misc{xiao_localized_2024,
title = {Localized Gaussian Splatting Editing with Contextual Awareness},
author = {Hanyuan Xiao and Yingshu Chen and Huajian Huang and Haolin Xiong and Jing Yang and Pratusha Prasad and Yajie Zhao},
url = {http://arxiv.org/abs/2408.00083},
year = {2024},
date = {2024-07-01},
urldate = {2024-08-16},
publisher = {arXiv},
abstract = {Recent text-guided generation of individual 3D object has achieved great success using diffusion priors. However, these methods are not suitable for object insertion and replacement tasks as they do not consider the background, leading to illumination mismatches within the environment. To bridge the gap, we introduce an illumination-aware 3D scene editing pipeline for 3D Gaussian Splatting (3DGS) representation. Our key observation is that inpainting by the state-of-the-art conditional 2D diffusion model is consistent with background in lighting. To leverage the prior knowledge from the well-trained diffusion models for 3D object generation, our approach employs a coarse-to-fine objection optimization pipeline with inpainted views. In the first coarse step, we achieve image-to-3D lifting given an ideal inpainted view. The process employs 3D-aware diffusion prior from a view-conditioned diffusion model, which preserves illumination present in the conditioning image. To acquire an ideal inpainted image, we introduce an Anchor View Proposal (AVP) algorithm to find a single view that best represents the scene illumination in target region. In the second Texture Enhancement step, we introduce a novel Depth-guided Inpainting Score Distillation Sampling (DI-SDS), which enhances geometry and texture details with the inpainting diffusion prior, beyond the scope of the 3D-aware diffusion prior knowledge in the first coarse step. DI-SDS not only provides fine-grained texture enhancement, but also urges optimization to respect scene lighting. Our approach efficiently achieves local editing with global illumination consistency without explicitly modeling light transport. We demonstrate robustness of our method by evaluating editing in real scenes containing explicit highlight and shadows, and compare against the state-of-the-art text-to-3D editing methods.},
note = {arXiv:2408.00083 [cs]},
keywords = {DTIC, VGL},
pubstate = {published},
tppubtype = {misc}
}
Chen, Meida; Lal, Devashish; Yu, Zifan; Xu, Jiuyi; Feng, Andrew; You, Suya; Nurunnabi, Abdul; Shi, Yangming
Large-Scale 3D Terrain Reconstruction Using 3D Gaussian Splatting for Visualization and Simulation Journal Article
In: Int. Arch. Photogramm. Remote Sens. Spatial Inf. Sci., vol. XLVIII-2-2024, pp. 49–54, 2024, ISSN: 2194-9034.
Abstract | Links | BibTeX | Tags: DTIC, Graphics, VGL
@article{chen_large-scale_2024,
title = {Large-Scale 3D Terrain Reconstruction Using 3D Gaussian Splatting for Visualization and Simulation},
author = {Meida Chen and Devashish Lal and Zifan Yu and Jiuyi Xu and Andrew Feng and Suya You and Abdul Nurunnabi and Yangming Shi},
url = {https://isprs-archives.copernicus.org/articles/XLVIII-2-2024/49/2024/},
doi = {10.5194/isprs-archives-XLVIII-2-2024-49-2024},
issn = {2194-9034},
year = {2024},
date = {2024-06-01},
urldate = {2024-06-20},
journal = {Int. Arch. Photogramm. Remote Sens. Spatial Inf. Sci.},
volume = {XLVIII-2-2024},
pages = {49–54},
abstract = {Abstract. The fusion of low-cost unmanned aerial systems (UAS) with advanced photogrammetric techniques has revolutionized 3D terrain reconstruction, enabling the automated creation of detailed models. Concurrently, the advent of 3D Gaussian Splatting has introduced a paradigm shift in 3D data representation, offering visually realistic renditions distinct from traditional polygon-based models. Our research builds upon this foundation, aiming to integrate Gaussian Splatting into interactive simulations for immersive virtual environments. We address challenges such as collision detection by adopting a hybrid approach, combining Gaussian Splatting with photogrammetry-derived meshes. Through comprehensive experimentation covering varying terrain sizes and Gaussian densities, we evaluate scalability, performance, and limitations. Our findings contribute to advancing the use of advanced computer graphics techniques for enhanced 3D terrain visualization and simulation.},
keywords = {DTIC, Graphics, VGL},
pubstate = {published},
tppubtype = {article}
}
Zhang, Mingyuan; Cai, Zhongang; Pan, Liang; Hong, Fangzhou; Guo, Xinying; Yang, Lei; Liu, Ziwei
MotionDiffuse: Text-Driven Human Motion Generation With Diffusion Model Journal Article
In: IEEE Trans. Pattern Anal. Mach. Intell., vol. 46, no. 6, pp. 4115–4128, 2024, ISSN: 0162-8828, 2160-9292, 1939-3539.
@article{zhang_motiondiffuse_2024,
title = {MotionDiffuse: Text-Driven Human Motion Generation With Diffusion Model},
author = {Mingyuan Zhang and Zhongang Cai and Liang Pan and Fangzhou Hong and Xinying Guo and Lei Yang and Ziwei Liu},
url = {https://ieeexplore.ieee.org/document/10416192/},
doi = {10.1109/TPAMI.2024.3355414},
issn = {0162-8828, 2160-9292, 1939-3539},
year = {2024},
date = {2024-06-01},
urldate = {2024-07-18},
journal = {IEEE Trans. Pattern Anal. Mach. Intell.},
volume = {46},
number = {6},
pages = {4115–4128},
keywords = {VGL},
pubstate = {published},
tppubtype = {article}
}
Nurunnabi, Abdul; Teferle, Felicia; Laefer, Debra F.; Chen, Meida; Ali, Mir Masoom
Development of a Precise Tree Structure from LiDAR Point Clouds Journal Article
In: Int. Arch. Photogramm. Remote Sens. Spatial Inf. Sci., vol. XLVIII-2-2024, pp. 301–308, 2024, ISSN: 2194-9034.
Abstract | Links | BibTeX | Tags: Narrative, VGL
@article{nurunnabi_development_2024,
title = {Development of a Precise Tree Structure from LiDAR Point Clouds},
author = {Abdul Nurunnabi and Felicia Teferle and Debra F. Laefer and Meida Chen and Mir Masoom Ali},
url = {https://isprs-archives.copernicus.org/articles/XLVIII-2-2024/301/2024/},
doi = {10.5194/isprs-archives-XLVIII-2-2024-301-2024},
issn = {2194-9034},
year = {2024},
date = {2024-06-01},
urldate = {2024-07-11},
journal = {Int. Arch. Photogramm. Remote Sens. Spatial Inf. Sci.},
volume = {XLVIII-2-2024},
pages = {301–308},
abstract = {Abstract. A precise tree structure that represents the distribution of tree stem, branches, and leaves is crucial for accurately capturing the full representation of a tree. Light Detection and Ranging (LiDAR)-based three-dimensional (3D) point clouds (PCs) capture the geometry of scanned objects including forests stands and individual trees. PCs are irregular, unstructured, often noisy, and contaminated by outliers. Researchers have struggled to develop methods to separate leaves and wood without losing the tree geometry. This paper proposes a solution that employs only the spatial coordinates (x, y, z) of the PC. The new algorithm works as a filtering approach, utilizing multi-scale neighborhood-based geometric features (GFs) e.g., linearity, planarity, and verticality to classify linear (wood) and non-linear (leaf) points. This involves finding potential wood points and coupling them with an octree-based segmentation to develop a tree architecture. The main contributions of this paper are (i) investigating the potential of different GFs to split linear and non-linear points, (ii) introducing a novel method that pointwise classifies leaf and wood points, and (iii) developing a precise 3D tree structure. The performance of the new algorithm has been demonstrated through terrestrial laser scanning PCs. For a Scots pine tree, the new method classifies leaf and wood points with an overall accuracy of 97.9%.},
keywords = {Narrative, VGL},
pubstate = {published},
tppubtype = {article}
}
Zhang, Hao; Chang, Di; Li, Fang; Soleymani, Mohammad; Ahuja, Narendra
MagicPose4D: Crafting Articulated Models with Appearance and Motion Control Miscellaneous
2024, (Version Number: 1).
Abstract | Links | BibTeX | Tags: VGL, Virtual Humans
@misc{zhang_magicpose4d_2024,
title = {MagicPose4D: Crafting Articulated Models with Appearance and Motion Control},
author = {Hao Zhang and Di Chang and Fang Li and Mohammad Soleymani and Narendra Ahuja},
url = {https://arxiv.org/abs/2405.14017},
doi = {10.48550/ARXIV.2405.14017},
year = {2024},
date = {2024-05-01},
urldate = {2024-06-25},
publisher = {arXiv},
abstract = {With the success of 2D and 3D visual generative models, there is growing interest in generating 4D content. Existing methods primarily rely on text prompts to produce 4D content, but they often fall short of accurately defining complex or rare motions. To address this limitation, we propose MagicPose4D, a novel framework for refined control over both appearance and motion in 4D generation. Unlike traditional methods, MagicPose4D accepts monocular videos as motion prompts, enabling precise and customizable motion generation. MagicPose4D comprises two key modules:
i) Dual-Phase 4D Reconstruction Modulevphantom which operates in two phases. The first phase focuses on capturing the model's shape using accurate 2D supervision and less accurate but geometrically informative 3D pseudo-supervision without imposing skeleton constraints. The second phase refines the model using more accurate pseudo-3D supervision, obtained in the first phase and introduces kinematic chain-based skeleton constraints to ensure physical plausibility. Additionally, we propose a Global-local Chamfer loss that aligns the overall distribution of predicted mesh vertices with the supervision while maintaining part-level alignment without extra annotations.
ii) Cross-category Motion Transfer Modulevphantom leverages the predictions from the 4D reconstruction module and uses a kinematic-chain-based skeleton to achieve cross-category motion transfer. It ensures smooth transitions between frames through dynamic rigidity, facilitating robust generalization without additional training.
Through extensive experiments, we demonstrate that MagicPose4D significantly improves the accuracy and consistency of 4D content generation, outperforming existing methods in various benchmarks.},
note = {Version Number: 1},
keywords = {VGL, Virtual Humans},
pubstate = {published},
tppubtype = {misc}
}
i) Dual-Phase 4D Reconstruction Modulevphantom which operates in two phases. The first phase focuses on capturing the model's shape using accurate 2D supervision and less accurate but geometrically informative 3D pseudo-supervision without imposing skeleton constraints. The second phase refines the model using more accurate pseudo-3D supervision, obtained in the first phase and introduces kinematic chain-based skeleton constraints to ensure physical plausibility. Additionally, we propose a Global-local Chamfer loss that aligns the overall distribution of predicted mesh vertices with the supervision while maintaining part-level alignment without extra annotations.
ii) Cross-category Motion Transfer Modulevphantom leverages the predictions from the 4D reconstruction module and uses a kinematic-chain-based skeleton to achieve cross-category motion transfer. It ensures smooth transitions between frames through dynamic rigidity, facilitating robust generalization without additional training.
Through extensive experiments, we demonstrate that MagicPose4D significantly improves the accuracy and consistency of 4D content generation, outperforming existing methods in various benchmarks.
Liu, Rong; Xu, Rui; Hu, Yue; Chen, Meida; Feng, Andrew
AtomGS: Atomizing Gaussian Splatting for High-Fidelity Radiance Field Miscellaneous
2024, (Version Number: 2).
Abstract | Links | BibTeX | Tags: Graphics, VGL
@misc{liu_atomgs_2024,
title = {AtomGS: Atomizing Gaussian Splatting for High-Fidelity Radiance Field},
author = {Rong Liu and Rui Xu and Yue Hu and Meida Chen and Andrew Feng},
url = {https://arxiv.org/abs/2405.12369},
doi = {10.48550/ARXIV.2405.12369},
year = {2024},
date = {2024-05-01},
urldate = {2024-07-11},
publisher = {arXiv},
abstract = {3D Gaussian Splatting (3DGS) has recently advanced radiance field reconstruction by offering superior capabilities for novel view synthesis and real-time rendering speed. However, its strategy of blending optimization and adaptive density control might lead to sub-optimal results; it can sometimes yield noisy geometry and blurry artifacts due to prioritizing optimizing large Gaussians at the cost of adequately densifying smaller ones. To address this, we introduce AtomGS, consisting of Atomized Proliferation and Geometry-Guided Optimization. The Atomized Proliferation constrains ellipsoid Gaussians of various sizes into more uniform-sized Atom Gaussians. This strategy enhances the representation of areas with fine features by placing greater emphasis on densification in accordance with scene details. In addition, we propose a Geometry-Guided Optimization approach that incorporates an Edge-Aware Normal Loss. This optimization method effectively smooths flat surfaces while preserving intricate details. Our evaluation shows that AtomGS outperforms existing state-of-the-art methods in rendering quality. Additionally, it achieves competitive accuracy in geometry reconstruction and offers a significant improvement in training speed over other SDF-based methods. More interactive demos can be found on our website (https://rongliu-leo.github.io/AtomGS/).},
note = {Version Number: 2},
keywords = {Graphics, VGL},
pubstate = {published},
tppubtype = {misc}
}
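The Edge-Aware Normal Loss mentioned in the AtomGS abstract can be pictured with a standard edge-aware smoothness formulation: penalize normal-map gradients, but down-weight them where the RGB image itself has strong edges. The sketch below is this generic formulation, not necessarily the exact loss used in the paper.

import torch

def edge_aware_normal_loss(normal_map, image):
    # normal_map: (3, H, W) rendered normals; image: (3, H, W) RGB.
    # Smooth normals on flat regions, leave them alone where the image has edges.
    dn_x = (normal_map[:, :, 1:] - normal_map[:, :, :-1]).abs().mean(0)  # (H, W-1)
    dn_y = (normal_map[:, 1:, :] - normal_map[:, :-1, :]).abs().mean(0)  # (H-1, W)
    di_x = (image[:, :, 1:] - image[:, :, :-1]).abs().mean(0)
    di_y = (image[:, 1:, :] - image[:, :-1, :]).abs().mean(0)
    return (dn_x * torch.exp(-di_x)).mean() + (dn_y * torch.exp(-di_y)).mean()

print(edge_aware_normal_loss(torch.rand(3, 64, 64), torch.rand(3, 64, 64)))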
Zhang, Hui; Kuang, Bingran; Zhao, Yajie
Camera Calibration using a Single View of a Symmetric Object Proceedings Article
In: ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2705–2709, IEEE, Seoul, Korea, Republic of, 2024, ISBN: 979-8-3503-4485-1.
Links | BibTeX | Tags: Graphics, VGL
@inproceedings{zhang_camera_2024,
title = {Camera Calibration using a Single View of a Symmetric Object},
author = {Hui Zhang and Bingran Kuang and Yajie Zhao},
url = {https://ieeexplore.ieee.org/document/10446005/},
doi = {10.1109/ICASSP48485.2024.10446005},
isbn = {979-8-3503-4485-1},
year = {2024},
date = {2024-04-01},
urldate = {2024-06-25},
booktitle = {ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages = {2705–2709},
publisher = {IEEE},
address = {Seoul, Korea, Republic of},
keywords = {Graphics, VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
Chen, Haiwei; Zhao, Yajie
Don't Look into the Dark: Latent Codes for Pluralistic Image Inpainting Miscellaneous
2024, (arXiv:2403.18186 [cs]).
Abstract | Links | BibTeX | Tags: VGL
@misc{chen_dont_2024,
title = {Don't Look into the Dark: Latent Codes for Pluralistic Image Inpainting},
author = {Haiwei Chen and Yajie Zhao},
url = {http://arxiv.org/abs/2403.18186},
year = {2024},
date = {2024-03-01},
urldate = {2024-08-15},
publisher = {arXiv},
abstract = {We present a method for large-mask pluralistic image inpainting based on the generative framework of discrete latent codes. Our method learns latent priors, discretized as tokens, by only performing computations at the visible locations of the image. This is realized by a restrictive partial encoder that predicts the token label for each visible block, a bidirectional transformer that infers the missing labels by only looking at these tokens, and a dedicated synthesis network that couples the tokens with the partial image priors to generate a coherent and pluralistic complete image even under extreme mask settings. Experiments on public benchmarks validate our design choices as the proposed method outperforms strong baselines in both visual quality and diversity metrics.},
note = {arXiv:2403.18186 [cs]},
keywords = {VGL},
pubstate = {published},
tppubtype = {misc}
}
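To make the pipeline in the abstract above concrete, the toy PyTorch module below mimics the bidirectional-transformer step: visible positions keep their token labels from the partial encoder, missing positions are replaced by a [MASK] token, and the transformer predicts logits over the codebook for every position. All module names and sizes are illustrative assumptions, not the paper's architecture.

import torch
import torch.nn as nn

class MaskedTokenInfiller(nn.Module):
    def __init__(self, vocab=512, dim=256, n_layers=4, n_heads=8, seq_len=256):
        super().__init__()
        self.tok = nn.Embedding(vocab + 1, dim)          # last index acts as [MASK]
        self.pos = nn.Parameter(torch.zeros(seq_len, dim))
        layer = nn.TransformerEncoderLayer(dim, n_heads, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, n_layers)
        self.head = nn.Linear(dim, vocab)
        self.mask_id = vocab

    def forward(self, tokens, visible):
        # tokens: (B, L) token labels from the partial encoder; visible: (B, L) bool.
        x = torch.where(visible, tokens, torch.full_like(tokens, self.mask_id))
        h = self.encoder(self.tok(x) + self.pos)
        return self.head(h)                              # (B, L, vocab) logits over the codebook

model = MaskedTokenInfiller()
logits = model(torch.randint(0, 512, (2, 256)), torch.rand(2, 256) > 0.6)
print(logits.shape)   # (2, 256, 512)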
2023
Yang, Jing; Xiao, Hanyuan; Teng, Wenbin; Cai, Yunxuan; Zhao, Yajie
Light Sampling Field and BRDF Representation for Physically-based Neural Rendering Journal Article
In: 2023, (Publisher: arXiv Version Number: 1).
Abstract | Links | BibTeX | Tags: DTIC, UARC, VGL
@article{yang_light_2023,
title = {Light Sampling Field and BRDF Representation for Physically-based Neural Rendering},
author = {Jing Yang and Hanyuan Xiao and Wenbin Teng and Yunxuan Cai and Yajie Zhao},
url = {https://arxiv.org/abs/2304.05472},
doi = {10.48550/ARXIV.2304.05472},
year = {2023},
date = {2023-04-01},
urldate = {2023-08-22},
abstract = {Physically-based rendering (PBR) is key for immersive rendering effects used widely in the industry to showcase detailed realistic scenes from computer graphics assets. A well-known caveat is that producing such renderings is computationally heavy and relies on complex capture devices. Inspired by the success in quality and efficiency of recent volumetric neural rendering, we want to develop a physically-based neural shader to eliminate device dependency and significantly boost performance. However, no existing lighting and material models in the current neural rendering approaches can accurately represent the comprehensive lighting models and BRDF properties required by the PBR process. Thus, this paper proposes a novel lighting representation that models direct and indirect light locally through a light sampling strategy in a learned light sampling field. We also propose BRDF models to separately represent surface/subsurface scattering details to enable complex objects such as translucent materials (e.g., skin, jade). We then implement our proposed representations with an end-to-end physically-based neural face skin shader, which takes a standard face asset (i.e., geometry, albedo map, and normal map) and an HDRI for illumination as inputs and generates a photo-realistic rendering as output. Extensive experiments showcase the quality and efficiency of our PBR face skin shader, indicating the effectiveness of our proposed lighting and material representations.},
note = {Publisher: arXiv
Version Number: 1},
keywords = {DTIC, UARC, VGL},
pubstate = {published},
tppubtype = {article}
}
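A heavily simplified reading of the lighting and shading representation described above: a small MLP (the "light sampling field") maps a surface point to a fixed set of local light samples, and a second MLP shades the point from those samples plus its albedo and normal. The layer sizes, sample count, and module structure below are assumptions for illustration only.

import torch
import torch.nn as nn

K = 16  # assumed number of local light samples per point
light_field = nn.Sequential(nn.Linear(3, 128), nn.ReLU(), nn.Linear(128, K * 6))
shader = nn.Sequential(nn.Linear(K * 6 + 6, 128), nn.ReLU(), nn.Linear(128, 3))

def shade(points, albedo, normals):
    # points, albedo, normals: (N, 3) each.
    samples = light_field(points)                   # (N, K*6): K directions + radiances
    x = torch.cat([samples, albedo, normals], dim=-1)
    return torch.sigmoid(shader(x))                 # (N, 3) shaded RGB

rgb = shade(torch.randn(1024, 3), torch.rand(1024, 3), torch.randn(1024, 3))
print(rgb.shape)  # (1024, 3)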
Chemburkar, Ankur; Lu, Shuhong; Feng, Andrew
MoDDM: Text-to-Motion Synthesis using Discrete Diffusion Model Proceedings Article
In: 2023.
Abstract | Links | BibTeX | Tags: VGL
@inproceedings{chemburkar_moddm_2023,
title = {MoDDM: Text-to-Motion Synthesis using Discrete Diffusion Model},
author = {Ankur Chemburkar and Shuhong Lu and Andrew Feng},
url = {https://papers.bmvc2023.org/0624.pdf},
year = {2023},
date = {2023-01-01},
abstract = {We present the motion discrete diffusion model (MoDDM) for synthesizing human motion from text descriptions that addresses challenges in cross-modal mapping and motion diversity. Previous methods that utilized variational autoencoders (VAEs) to learn the latent distributions for text-to-motion synthesis tend to produce motions with less diversity and fidelity. While diffusion models show promising results by generating high-quality motions, they require higher computational costs and may produce motions less aligned with the input text. The proposed method combines the discrete latent space and diffusion models to learn an expressive conditional probabilistic mapping for motion synthesis. Our method utilizes a vector quantization variational autoencoder (VQ-VAE) to learn discrete motion tokens and then applies discrete denoising diffusion probabilistic models (D3PM) to learn the conditional probability distributions for the motion tokens. Discrete classifier-free guidance is further utilized in the training process with a proper guidance scale for aligning the motions and the corresponding text descriptions. By learning the denoising model in the discrete latent space, the method produces high-quality motion results while greatly reducing computational costs compared to training diffusion models on raw motion sequences. The evaluation results show that the proposed approach outperforms previous methods in both motion quality and text-to-motion matching accuracy.},
keywords = {VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
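The discrete classifier-free guidance mentioned in the MoDDM abstract follows the usual recipe at sampling time: evaluate the denoiser with and without the text condition and extrapolate toward the conditional logits. The sketch below shows only that mixing step with a toy denoiser; the function names, codebook size, and guidance scale are illustrative assumptions.

import torch

def guided_logits(denoiser, tokens, t, text_emb, null_emb, scale=4.0):
    # Classifier-free guidance over discrete token logits: push the prediction
    # toward the conditional branch. The scale trades diversity for text alignment.
    logits_cond = denoiser(tokens, t, text_emb)
    logits_uncond = denoiser(tokens, t, null_emb)
    return logits_uncond + scale * (logits_cond - logits_uncond)

# Toy denoiser: random logits over a 1024-entry motion-token codebook.
toy = lambda tok, t, c: torch.randn(tok.shape[0], tok.shape[1], 1024)
out = guided_logits(toy, torch.randint(0, 1024, (2, 64)), 10,
                    text_emb=torch.randn(2, 512), null_emb=torch.zeros(2, 512))
print(out.shape)  # (2, 64, 1024)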
2022
Liu, Shichen; Cai, Yunxuan; Chen, Haiwei; Zhou, Yichao; Zhao, Yajie
Rapid Face Asset Acquisition with Recurrent Feature Alignment Journal Article
In: ACM Trans. Graph., vol. 41, no. 6, pp. 214:1–214:17, 2022, ISSN: 0730-0301.
Abstract | Links | BibTeX | Tags: DTIC, VGL
@article{liu_rapid_2022,
title = {Rapid Face Asset Acquisition with Recurrent Feature Alignment},
author = {Shichen Liu and Yunxuan Cai and Haiwei Chen and Yichao Zhou and Yajie Zhao},
url = {https://dl.acm.org/doi/10.1145/3550454.3555509},
doi = {10.1145/3550454.3555509},
issn = {0730-0301},
year = {2022},
date = {2022-11-01},
urldate = {2023-03-31},
journal = {ACM Trans. Graph.},
volume = {41},
number = {6},
pages = {214:1–214:17},
abstract = {We present Recurrent Feature Alignment (ReFA), an end-to-end neural network for the very rapid creation of production-grade face assets from multi-view images. ReFA is on par with the industrial pipelines in quality for producing accurate, complete, registered, and textured assets directly applicable to physically-based rendering, but produces the asset end-to-end, fully automatically, at a significantly faster speed of 4.5 FPS, which is unprecedented among neural-based techniques. Our method represents face geometry as a position map in the UV space. The network first extracts per-pixel features in both the multi-view image space and the UV space. A recurrent module then iteratively optimizes the geometry by projecting the image-space features to the UV space and comparing them with a reference UV-space feature. The optimized geometry then provides pixel-aligned signals for the inference of high-resolution textures. Experiments have validated that ReFA achieves a median error of 0.603mm in geometry reconstruction, is robust to extreme pose and expression, and excels in sparse-view settings. We believe that the progress achieved by our network enables lightweight, fast face asset acquisition that significantly boosts downstream applications, such as avatar creation and facial performance capture. It will also enable massive database capturing for deep learning purposes.},
keywords = {DTIC, VGL},
pubstate = {published},
tppubtype = {article}
}
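The recurrent UV-space refinement described in the ReFA abstract can be summarized as a loop of project, compare, and update steps. The sketch below captures only that control flow; the projection operator and update network are toy placeholders, not the paper's architecture.

import torch
import torch.nn as nn

def refa_refine(position_map, image_feats, uv_ref_feat, project, update_net, n_iters=5):
    # position_map: (B, 3, H, W) 3D points in UV space. Each iteration projects
    # image-space features into UV with the current geometry, compares them with a
    # reference UV feature, and predicts a per-pixel correction.
    for _ in range(n_iters):
        uv_feats = project(position_map, image_feats)
        delta = update_net(torch.cat([uv_feats, uv_ref_feat], dim=1))
        position_map = position_map + delta
    return position_map

B, H, W = 1, 64, 64
project = lambda pos, feats: feats.mean(dim=1)                 # dummy multi-view projection
update_net = nn.Conv2d(32 + 16, 3, kernel_size=3, padding=1)   # toy update head
pos = refa_refine(torch.zeros(B, 3, H, W),
                  torch.randn(B, 4, 32, H, W),                 # 4 views, 32-channel features
                  torch.randn(B, 16, H, W), project, update_net)
print(pos.shape)  # (1, 3, 64, 64)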
Kuang, Zhengfei; Li, Jiaman; He, Mingming; Wang, Tong; Zhao, Yajie
DenseGAP: Graph-Structured Dense Correspondence Learning with Anchor Points Proceedings Article
In: pp. 542–549, IEEE Computer Society, 2022, ISBN: 978-1-6654-9062-7.
Abstract | Links | BibTeX | Tags: VGL
@inproceedings{kuang_densegap_2022,
title = {DenseGAP: Graph-Structured Dense Correspondence Learning with Anchor Points},
author = {Zhengfei Kuang and Jiaman Li and Mingming He and Tong Wang and Yajie Zhao},
url = {https://www.computer.org/csdl/proceedings-article/icpr/2022/09956472/1IHpppIuqOc},
doi = {10.1109/ICPR56361.2022.9956472},
isbn = {978-1-6654-9062-7},
year = {2022},
date = {2022-08-01},
urldate = {2023-03-31},
pages = {542–549},
publisher = {IEEE Computer Society},
abstract = {Establishing dense correspondence between two images is a fundamental computer vision problem, which is typically tackled by matching local feature descriptors. However, without global awareness, such local features are often insufficient for disambiguating similar regions. Moreover, computing the pairwise feature correlation across images is both computationally expensive and memory-intensive. To make the local features aware of the global context and improve their matching accuracy, we introduce DenseGAP, a new solution for efficient Dense correspondence learning with a Graph-structured neural network conditioned on Anchor Points. Specifically, we first propose a graph structure that utilizes anchor points to provide sparse but reliable prior on inter- and intra-image context and propagates them to all image points via directed edges. We also design a graph-structured network to broadcast multi-level contexts via lightweight message-passing layers and generate high-resolution feature maps at low memory cost. Finally, based on the predicted feature maps, we introduce a coarse-to-fine framework for accurate correspondence prediction using cycle consistency. Our feature descriptors capture both local and global information, thus enabling a continuous feature field for querying arbitrary points at high resolution. Through comprehensive ablative experiments and evaluations on large-scale indoor and outdoor datasets, we demonstrate that our method advances the state of the art of correspondence learning on most benchmarks.},
keywords = {VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
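The anchor-point idea in the DenseGAP abstract amounts to routing global context through a small set of anchors instead of computing the full pairwise correlation between all image points. The sketch below uses plain dot-product attention in both directions purely for illustration; the paper's graph-structured message passing is more involved.

import torch

def propagate_via_anchors(point_feats, anchor_feats):
    # point_feats: (B, N, C) dense image-point features; anchor_feats: (B, K, C).
    # Anchors first gather context from all points, then broadcast it back,
    # keeping the cost O(N*K) instead of O(N^2).
    att_up = torch.softmax(anchor_feats @ point_feats.transpose(1, 2), dim=-1)   # (B, K, N)
    anchors = anchor_feats + att_up @ point_feats                                # anchors gather context
    att_down = torch.softmax(point_feats @ anchors.transpose(1, 2), dim=-1)      # (B, N, K)
    return point_feats + att_down @ anchors                                      # points receive global context

x = propagate_via_anchors(torch.randn(2, 4096, 64), torch.randn(2, 128, 64))
print(x.shape)  # (2, 4096, 64)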
Chen, Haiwei; Liu, Jiayi; Chen, Weikai; Liu, Shichen; Zhao, Yajie
Exemplar-based Pattern Synthesis with Implicit Periodic Field Network Proceedings Article
In: 2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3698–3707, IEEE, New Orleans, LA, USA, 2022, ISBN: 978-1-6654-6946-3.
Links | BibTeX | Tags: UARC, VGL
@inproceedings{chen_exemplar-based_2022,
title = {Exemplar-based Pattern Synthesis with Implicit Periodic Field Network},
author = {Haiwei Chen and Jiayi Liu and Weikai Chen and Shichen Liu and Yajie Zhao},
url = {https://ieeexplore.ieee.org/document/9879904/},
doi = {10.1109/CVPR52688.2022.00369},
isbn = {978-1-6654-6946-3},
year = {2022},
date = {2022-06-01},
urldate = {2023-02-10},
booktitle = {2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
pages = {3698–3707},
publisher = {IEEE},
address = {New Orleans, LA, USA},
keywords = {UARC, VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
Danieau, Fabien; Guillotel, Philippe; Hoyet, Ludovic; Tonneau, Steve; Zhao, Yajie
Editorial: Creating Lifelike Digital Humans Journal Article
In: Front. Virtual Real., vol. 3, pp. 906118, 2022, ISSN: 2673-4192.
@article{danieau_editorial_2022,
title = {Editorial: Creating Lifelike Digital Humans},
author = {Fabien Danieau and Philippe Guillotel and Ludovic Hoyet and Steve Tonneau and Yajie Zhao},
url = {https://www.frontiersin.org/articles/10.3389/frvir.2022.906118/full},
doi = {10.3389/frvir.2022.906118},
issn = {2673-4192},
year = {2022},
date = {2022-04-01},
urldate = {2024-08-13},
journal = {Front. Virtual Real.},
volume = {3},
pages = {906118},
keywords = {VGL},
pubstate = {published},
tppubtype = {article}
}
Liu, Shichen; Li, Tianye; Chen, Weikai; Li, Hao
A General Differentiable Mesh Renderer for Image-Based 3D Reasoning Journal Article
In: IEEE Transactions on Pattern Analysis and Machine Intelligence, vol. 44, no. 1, pp. 50–62, 2022, ISSN: 1939-3539, (Conference Name: IEEE Transactions on Pattern Analysis and Machine Intelligence).
Abstract | Links | BibTeX | Tags: VGL
@article{liu_general_2022,
title = {A General Differentiable Mesh Renderer for Image-Based 3D Reasoning},
author = {Shichen Liu and Tianye Li and Weikai Chen and Hao Li},
doi = {10.1109/TPAMI.2020.3007759},
issn = {1939-3539},
year = {2022},
date = {2022-01-01},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
volume = {44},
number = {1},
pages = {50–62},
abstract = {Rendering bridges the gap between 2D vision and 3D scenes by simulating the physical process of image formation. By inverting such a renderer, one can think of a learning approach to infer 3D information from 2D images. However, standard graphics renderers involve a fundamental step called rasterization, which prevents rendering from being differentiable. Unlike the state-of-the-art differentiable renderers (Kato et al. 2018 and Loper 2018), which only approximate the rendering gradient in the backpropagation, we propose a naturally differentiable rendering framework that is able to (1) directly render colorized mesh using differentiable functions and (2) back-propagate efficient supervisions to mesh vertices and their attributes from various forms of image representations. The key to our framework is a novel formulation that views rendering as an aggregation function that fuses the probabilistic contributions of all mesh triangles with respect to the rendered pixels. Such a formulation enables our framework to flow gradients to the occluded and distant vertices, which cannot be achieved by previous state-of-the-art methods. We show that by using the proposed renderer, one can achieve significant improvement in 3D unsupervised single-view reconstruction both qualitatively and quantitatively. Experiments also demonstrate that our approach can handle the challenging tasks in image-based shape fitting, which remain nontrivial to existing differentiable renderers.},
note = {Conference Name: IEEE Transactions on Pattern Analysis and Machine Intelligence},
keywords = {VGL},
pubstate = {published},
tppubtype = {article}
}
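The aggregation view of rendering described in the abstract above can be illustrated for a single pixel: every triangle contributes a color weighted by a screen-space coverage probability and a depth-aware softmax, so gradients reach occluded and distant vertices. The distance and depth weighting functions below are simplified stand-ins for the ones defined in the paper.

import torch

def soft_pixel(colors, dists, depths, sigma=1e-4, gamma=0.1, bg=1.0):
    # colors: (T, 3) per-triangle colors; dists: (T,) signed screen-space distances
    # from the pixel to each triangle (positive = inside); depths: (T,) in (0, 1),
    # smaller = nearer. All triangles contribute, so the result stays differentiable.
    coverage = torch.sigmoid(dists / sigma)             # per-triangle coverage probability
    closeness = 1.0 - depths                            # nearer surfaces get larger scores
    w = coverage * torch.exp(closeness / gamma)
    w_bg = torch.exp(torch.tensor(0.0))                 # background "closeness" = 0
    weights = torch.cat([w, w_bg[None]])
    weights = weights / weights.sum()
    rgb = torch.cat([colors, torch.full((1, 3), bg)])   # append background color
    return weights @ rgb                                # (3,) blended pixel color

print(soft_pixel(torch.rand(4, 3), torch.randn(4) * 1e-4, torch.rand(4)))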
2021
Li, Jiaman; Villegas, Ruben; Ceylan, Duygu; Yang, Jimei; Kuang, Zhengfei; Li, Hao; Zhao, Yajie
Task-Generic Hierarchical Human Motion Prior using VAEs Proceedings Article
In: 2021 International Conference on 3D Vision (3DV), pp. 771–781, IEEE, London, United Kingdom, 2021, ISBN: 978-1-6654-2688-6.
Links | BibTeX | Tags: DTIC, UARC, VGL
@inproceedings{li_task-generic_2021,
title = {Task-Generic Hierarchical Human Motion Prior using VAEs},
author = {Jiaman Li and Ruben Villegas and Duygu Ceylan and Jimei Yang and Zhengfei Kuang and Hao Li and Yajie Zhao},
url = {https://ieeexplore.ieee.org/document/9665881/},
doi = {10.1109/3DV53792.2021.00086},
isbn = {978-1-6654-2688-6},
year = {2021},
date = {2021-12-01},
urldate = {2022-09-22},
booktitle = {2021 International Conference on 3D Vision (3DV)},
pages = {771–781},
publisher = {IEEE},
address = {London, United Kingdom},
keywords = {DTIC, UARC, VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
Liu, Shichen; Zhou, Yichao; Zhao, Yajie
VaPiD: A Rapid Vanishing Point Detector via Learned Optimizers Proceedings Article
In: 2021 IEEE/CVF International Conference on Computer Vision (ICCV), pp. 12839–12848, IEEE, Montreal, QC, Canada, 2021, ISBN: 978-1-6654-2812-5.
Links | BibTeX | Tags: DTIC, UARC, VGL
@inproceedings{liu_vapid_2021,
title = {VaPiD: A Rapid Vanishing Point Detector via Learned Optimizers},
author = {Shichen Liu and Yichao Zhou and Yajie Zhao},
url = {https://ieeexplore.ieee.org/document/9711313/},
doi = {10.1109/ICCV48922.2021.01262},
isbn = {978-1-6654-2812-5},
year = {2021},
date = {2021-10-01},
urldate = {2022-09-22},
booktitle = {2021 IEEE/CVF International Conference on Computer Vision (ICCV)},
pages = {12839–12848},
publisher = {IEEE},
address = {Montreal, QC, Canada},
keywords = {DTIC, UARC, VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
Li, Tianye; Liu, Shichen; Bolkart, Timo; Liu, Jiayi; Li, Hao; Zhao, Yajie
Topologically Consistent Multi-View Face Inference Using Volumetric Sampling Proceedings Article
In: 2021 IEEE/CVF International Conference on Computer Vision (ICCV), pp. 3804–3814, IEEE, Montreal, QC, Canada, 2021, ISBN: 978-1-6654-2812-5.
Links | BibTeX | Tags: DTIC, UARC, VGL
@inproceedings{li_topologically_2021,
title = {Topologically Consistent Multi-View Face Inference Using Volumetric Sampling},
author = {Tianye Li and Shichen Liu and Timo Bolkart and Jiayi Liu and Hao Li and Yajie Zhao},
url = {https://ieeexplore.ieee.org/document/9711264/},
doi = {10.1109/ICCV48922.2021.00380},
isbn = {978-1-6654-2812-5},
year = {2021},
date = {2021-10-01},
urldate = {2022-09-22},
booktitle = {2021 IEEE/CVF International Conference on Computer Vision (ICCV)},
pages = {3804–3814},
publisher = {IEEE},
address = {Montreal, QC, Canada},
keywords = {DTIC, UARC, VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
Xiang, Sitao; Gu, Yuming; Xiang, Pengda; Chai, Menglei; Li, Hao; Zhao, Yajie; He, Mingming
DisUnknown: Distilling Unknown Factors for Disentanglement Learning Proceedings Article
In: 2021 IEEE/CVF International Conference on Computer Vision (ICCV), pp. 14790–14799, IEEE, Montreal, QC, Canada, 2021, ISBN: 978-1-6654-2812-5.
Links | BibTeX | Tags: DTIC, UARC, VGL
@inproceedings{xiang_disunknown_2021,
title = {DisUnknown: Distilling Unknown Factors for Disentanglement Learning},
author = {Sitao Xiang and Yuming Gu and Pengda Xiang and Menglei Chai and Hao Li and Yajie Zhao and Mingming He},
url = {https://ieeexplore.ieee.org/document/9709965/},
doi = {10.1109/ICCV48922.2021.01454},
isbn = {978-1-6654-2812-5},
year = {2021},
date = {2021-10-01},
urldate = {2022-09-23},
booktitle = {2021 IEEE/CVF International Conference on Computer Vision (ICCV)},
pages = {14790–14799},
publisher = {IEEE},
address = {Montreal, QC, Canada},
keywords = {DTIC, UARC, VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
Xiang, Sitao
Eliminating topological errors in neural network rotation estimation using self-selecting ensembles Journal Article
In: ACM Trans. Graph., vol. 40, no. 4, pp. 167:1–167:21, 2021, ISSN: 0730-0301.
Abstract | Links | BibTeX | Tags: VGL
@article{xiang_eliminating_2021,
title = {Eliminating topological errors in neural network rotation estimation using self-selecting ensembles},
author = {Sitao Xiang},
url = {https://dl.acm.org/doi/10.1145/3450626.3459882},
doi = {10.1145/3450626.3459882},
issn = {0730-0301},
year = {2021},
date = {2021-07-01},
urldate = {2023-03-31},
journal = {ACM Trans. Graph.},
volume = {40},
number = {4},
pages = {167:1–167:21},
abstract = {Many problems in computer graphics and computer vision applications involve inferring a rotation from a variety of different forms of inputs. With the increasing use of deep learning, neural networks have been employed to solve such problems. However, the traditional representations for 3D rotations, the quaternions and Euler angles, are found to be problematic for neural networks in practice, producing seemingly unavoidable large estimation errors. Previous research has identified the discontinuity of the mapping from SO(3) to the quaternions or Euler angles as the source of such errors, and to solve it, embeddings of SO(3) have been proposed as the output representation of rotation estimation networks instead. In this paper, we argue that the argument against quaternions and Euler angles from local discontinuities of the mappings from SO(3) is flawed, and instead provide a different argument from the global topological properties of SO(3) that also establishes the lower bound of maximum error when using quaternions and Euler angles for rotation estimation networks. Extending from this view, we discover that rotation symmetries in the input object cause additional topological problems that even using embeddings of SO(3) as the output representation would not correctly handle. We propose the self-selecting ensemble, a topologically motivated approach, where the network makes multiple predictions and assigns weights to them. We show theoretically and with experiments that our methods can be combined with a wide range of different rotation representations and can handle all kinds of finite symmetries in 3D rotation estimation problems.},
keywords = {VGL},
pubstate = {published},
tppubtype = {article}
}
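A minimal sketch of the self-selecting ensemble idea from the abstract above: the network emits several candidate rotations together with selection weights, and the highest-weighted candidate is used at inference time. The quaternion parameterization and branch count here are illustrative choices, not the paper's configuration.

import torch
import torch.nn as nn

class SelfSelectingRotationHead(nn.Module):
    def __init__(self, feat_dim=128, n_branches=4):
        super().__init__()
        self.quats = nn.Linear(feat_dim, 4 * n_branches)   # candidate rotations
        self.weights = nn.Linear(feat_dim, n_branches)     # selection weights
        self.n = n_branches

    def forward(self, feat):
        q = self.quats(feat).view(-1, self.n, 4)
        q = nn.functional.normalize(q, dim=-1)             # unit quaternions
        w = torch.softmax(self.weights(feat), dim=-1)      # per-branch selection weights
        return q, w

head = SelfSelectingRotationHead()
q, w = head(torch.randn(8, 128))
best = q[torch.arange(8), w.argmax(dim=-1)]                # chosen rotation per sample
print(best.shape)                                          # (8, 4)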
Chen, Haiwei; Liu, Shichen; Chen, Weikai; Li, Hao; Hill, Randall
Equivariant Point Network for 3D Point Cloud Analysis Proceedings Article
In: pp. 14514–14523, 2021.
Links | BibTeX | Tags: UARC, VGL
@inproceedings{chen_equivariant_2021,
title = {Equivariant Point Network for 3D Point Cloud Analysis},
author = {Haiwei Chen and Shichen Liu and Weikai Chen and Hao Li and Randall Hill},
url = {https://openaccess.thecvf.com/content/CVPR2021/html/Chen_Equivariant_Point_Network_for_3D_Point_Cloud_Analysis_CVPR_2021_paper.html},
year = {2021},
date = {2021-01-01},
urldate = {2023-03-31},
pages = {14514–14523},
keywords = {UARC, VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
2020
Zhou, Yi; Wu, Chenglei; Li, Zimo; Cao, Chen; Ye, Yuting; Saragih, Jason; Li, Hao; Sheikh, Yaser
Fully convolutional mesh autoencoder using efficient spatially varying kernels Proceedings Article
In: Proceedings of the 34th International Conference on Neural Information Processing Systems, pp. 9251–9262, Curran Associates Inc., Red Hook, NY, USA, 2020, ISBN: 978-1-7138-2954-6.
@inproceedings{zhou_fully_2020,
title = {Fully convolutional mesh autoencoder using efficient spatially varying kernels},
author = {Yi Zhou and Chenglei Wu and Zimo Li and Chen Cao and Yuting Ye and Jason Saragih and Hao Li and Yaser Sheikh},
isbn = {978-1-7138-2954-6},
year = {2020},
date = {2020-12-01},
urldate = {2023-03-31},
booktitle = {Proceedings of the 34th International Conference on Neural Information Processing Systems},
pages = {9251–9262},
publisher = {Curran Associates Inc.},
address = {Red Hook, NY, USA},
series = {NIPS'20},
abstract = {Learning latent representations of registered meshes is useful for many 3D tasks. Techniques have recently shifted to neural mesh autoencoders. Although they demonstrate higher precision than traditional methods, they remain unable to capture fine-grained deformations. Furthermore, these methods can only be applied to a template-specific surface mesh, and are not applicable to more general meshes, like tetrahedrons and non-manifold meshes. While more general graph convolution methods can be employed, they lack performance in reconstruction precision and require higher memory usage. In this paper, we propose a non-template-specific fully convolutional mesh autoencoder for arbitrary registered mesh data. It is enabled by our novel convolution and (un)pooling operators learned with globally shared weights and locally varying coefficients which can efficiently capture the spatially varying contents presented by irregular mesh connections. Our model outperforms state-of-the-art methods on reconstruction accuracy. In addition, the latent codes of our network are fully localized thanks to the fully convolutional structure, and thus have much higher interpolation capability than many traditional 3D mesh generation models.},
keywords = {VGL},
pubstate = {published},
tppubtype = {inproceedings}
}
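The convolution operator described in the abstract above pairs globally shared weight bases with locally varying per-vertex, per-neighbor coefficients. The sketch below implements that pairing for a fixed-valence neighborhood table; the sizes, initialization, and the absence of the (un)pooling operators are simplifications for illustration.

import torch
import torch.nn as nn

class LocallyVaryingGraphConv(nn.Module):
    def __init__(self, in_ch, out_ch, n_basis, neighbors):
        super().__init__()
        # neighbors: (V, K) long tensor of neighbor indices for each vertex.
        V, K = neighbors.shape
        self.neighbors = neighbors
        self.basis = nn.Parameter(torch.randn(n_basis, in_ch, out_ch) * 0.02)  # shared weight bases
        self.coeff = nn.Parameter(torch.randn(V, K, n_basis) * 0.02)           # locally varying coefficients

    def forward(self, x):
        # x: (B, V, in_ch) vertex features.
        nb = x[:, self.neighbors]                                    # (B, V, K, in_ch) neighbor features
        w = torch.einsum('vkn,nio->vkio', self.coeff, self.basis)    # per-edge weights from shared bases
        return torch.einsum('bvki,vkio->bvo', nb, w)                 # (B, V, out_ch)

V, K = 100, 6
conv = LocallyVaryingGraphConv(16, 32, n_basis=8, neighbors=torch.randint(0, V, (V, K)))
out = conv(torch.randn(2, V, 16))
print(out.shape)   # (2, 100, 32)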
0000
Chen, Haiwei; Zhao, Yajie
Don't Look into the Dark: Latent Codes for Pluralistic Image Inpainting Proceedings Article
In: pp. 7591–7600, 0000.
Abstract | Links | BibTeX | Tags: DTIC, Graphics, VGL
@inproceedings{chen_dont_nodate,
title = {Don't Look into the Dark: Latent Codes for Pluralistic Image Inpainting},
author = {Haiwei Chen and Yajie Zhao},
url = {https://openaccess.thecvf.com/content/CVPR2024/html/Chen_Dont_Look_into_the_Dark_Latent_Codes_for_Pluralistic_Image_CVPR_2024_paper.html},
pages = {7591–7600},
abstract = {We present a method for large-mask pluralistic image inpainting based on the generative framework of discrete latent codes. Our method learns latent priors, discretized as tokens, by only performing computations at the visible locations of the image. This is realized by a restrictive partial encoder that predicts the token label for each visible block, a bidirectional transformer that infers the missing labels by only looking at these tokens, and a dedicated synthesis network that couples the tokens with the partial image priors to generate a coherent and pluralistic complete image even under extreme mask settings. Experiments on public benchmarks validate our design choices as the proposed method outperforms strong baselines in both visual quality and diversity metrics.},
keywords = {DTIC, Graphics, VGL},
pubstate = {published},
tppubtype = {inproceedings}
}