Fix thread support (#640)

This PR fixes several things related to threads emulation:

1. Support `SameTebFlags.InitialThread`
This flag is needed to support emulation of .NET executables (not yet
fully supported) that don't have an entry point set in the `PE` header. This
applies to both `PE32` and `PE64` executables. If `InitialThread` is set,
the loader substitutes the entry point of the .NET executable with
`mscoree.dll!_CorExeMain`.

2. Fix static thread local storage for `WOW64`
This fix resolves `shell32.dll` initialization on `WOW64`. This fix also
uses correct structure and field names that are obtained from the
corresponding `.pdb` files.

3. Fix dynamic thread local storage for `WOW64`

4. Fix setting argument of a `WOW64` thread start proc

5. Fix creating suspended thread and parse create_flags
Currently, creating a suspended thread doesn't work because the
`NtCreateThreadEx` handler uses the invalid flag `CREATE_SUSPENDED`. This PR
fixes that and, moreover, carefully parses the create_flags of the
`NtCreateThreadEx` call.

6. Fix `FS` and `GS` handling
This PR fixes several problems with `GS` and `FS` segments:

    * Wrong GDT descriptor for selector 0x53
* Update GDT descriptor for selector 0x53 for a `WOW64` process on every
context switch, like Windows does
* Set `GS` base when `GS` segment register is updated in 64-bit code
(code selector is `0x33`). When `GS` segment register is loaded with
correct selector (`0x2b`) `GS` base is set to 0. So, when the code
accesses something like `gs:[0]`, a page fault occurs. `KiPageFault`
handles this situation and sets correct `GS` base.

Also, take into account that `teb64.ExceptionList` initially contains
the `teb32` address for a `WOW64` process. This is used to set up the `FS`
base when the `wrfsbase` instruction is available. We can enable this
instruction using `kusd.ProcessorFeatures.arr[PF_RDWRFSGSBASE_AVAILABLE] = 1;`,
and this works perfectly with the `unicorn` backend. Unfortunately, the
`icicle` backend does not support `wrfsbase`, so I don't enable this
instruction by default.
This commit is contained in:
Maurice Heumann
2025-12-30 17:56:30 +01:00
committed by GitHub
14 changed files with 241 additions and 159 deletions

View File

@@ -19,7 +19,7 @@ namespace
descriptor |= ((base & 0xFF0000) << 16); // Base[23:16]
descriptor |= (0xF3ULL << 40); // P=1, DPL=3, S=1, Type=3 (Data RW Accessed)
descriptor |= (static_cast<uint64_t>((limit & 0xF0000) >> 16) << 48); // Limit[19:16]
descriptor |= (0x40ULL << 52); // G=0 (byte), D=1 (32-bit), L=0, AVL=0
descriptor |= (0x40ULL << 48); // G=0 (byte), D=1 (32-bit), L=0, AVL=0
descriptor |= ((base & 0xFF000000) << 32); // Base[31:24]
// Write the updated descriptor to GDT index 10 (selector 0x53)
@@ -116,15 +116,18 @@ namespace
}
emulator_thread::emulator_thread(memory_manager& memory, const process_context& context, const uint64_t start_address,
const uint64_t argument, const uint64_t stack_size, const bool suspended, const uint32_t id)
const uint64_t argument, const uint64_t stack_size, const uint32_t create_flags, const uint32_t id,
const bool initial_thread)
: memory_ptr(&memory),
// stack_size(page_align_up(std::max(stack_size, static_cast<uint64_t>(STACK_SIZE)))),
start_address(start_address),
argument(argument),
id(id),
suspended(suspended),
create_flags(create_flags),
last_registers(context.default_register_set)
{
this->suspended = create_flags & THREAD_CREATE_FLAGS_CREATE_SUSPENDED;
// native 64-bit
if (!context.is_wow64_process)
{
@@ -152,6 +155,10 @@ emulator_thread::emulator_thread(memory_manager& memory, const process_context&
teb_obj.NtTib.Self = this->teb64->value();
teb_obj.CurrentLocale = 0x409;
teb_obj.ProcessEnvironmentBlock = context.peb64.value();
teb_obj.SameTebFlags.InitialThread = initial_thread;
teb_obj.SameTebFlags.SkipThreadAttach = (create_flags & THREAD_CREATE_FLAGS_SKIP_THREAD_ATTACH) ? 1 : 0;
teb_obj.SameTebFlags.LoaderWorker = (create_flags & THREAD_CREATE_FLAGS_LOADER_WORKER) ? 1 : 0;
teb_obj.SameTebFlags.SkipLoaderInit = (create_flags & THREAD_CREATE_FLAGS_SKIP_LOADER_INIT) ? 1 : 0;
});
return;
@@ -209,6 +216,10 @@ emulator_thread::emulator_thread(memory_manager& memory, const process_context&
teb_obj.CurrentLocale = 0x409;
teb_obj.ProcessEnvironmentBlock = context.peb64.value();
teb_obj.SameTebFlags.InitialThread = initial_thread;
teb_obj.SameTebFlags.SkipThreadAttach = (create_flags & THREAD_CREATE_FLAGS_SKIP_THREAD_ATTACH) ? 1 : 0;
teb_obj.SameTebFlags.LoaderWorker = (create_flags & THREAD_CREATE_FLAGS_LOADER_WORKER) ? 1 : 0;
teb_obj.SameTebFlags.SkipLoaderInit = (create_flags & THREAD_CREATE_FLAGS_SKIP_LOADER_INIT) ? 1 : 0;
teb_obj.StaticUnicodeString.MaximumLength = sizeof(teb_obj.StaticUnicodeBuffer);
teb_obj.StaticUnicodeString.Buffer = this->teb64->value() + offsetof(TEB64, StaticUnicodeBuffer);
@@ -264,18 +275,18 @@ emulator_thread::emulator_thread(memory_manager& memory, const process_context&
}
teb32_obj.WowTebOffset = -0x2000;
teb32_obj.InitialThread = initial_thread;
teb32_obj.SkipThreadAttach = (create_flags & THREAD_CREATE_FLAGS_SKIP_THREAD_ATTACH) ? 1 : 0;
teb32_obj.LoaderWorker = (create_flags & THREAD_CREATE_FLAGS_LOADER_WORKER) ? 1 : 0;
teb32_obj.SkipLoaderInit = (create_flags & THREAD_CREATE_FLAGS_SKIP_LOADER_INIT) ? 1 : 0;
// Note: CurrentLocale and other fields will be initialized by WOW64 runtime
});
// CRITICAL: Setup FS segment (0x53) to point to 32-bit TEB for accurate WOW64 emulation
// This mimics what Windows kernel does during NtCreateUserProcess for WOW64 processes
// Without this, FS:0 won't correctly access the 32-bit TEB
//
// NOTE: We cannot use set_segment_base() here because that sets the FS_BASE MSR
// which is for 64-bit flat addressing. 32-bit code uses actual GDT-based segmentation
// with selector 0x53, so we must modify the GDT entry directly.
setup_wow64_fs_segment(memory, teb32_addr);
this->teb64->access([&](TEB64& teb_obj) {
// teb64.ExceptionList initially points to teb32
teb_obj.NtTib.ExceptionList = teb32_addr;
});
// Use the allocator to reserve memory for CONTEXT64
this->wow64_cpu_reserved = emulator_object<WOW64_CPURESERVED>{memory, wow64_cpureserved_base};
@@ -439,6 +450,7 @@ void emulator_thread::setup_registers(x86_64_emulator& emu, const process_contex
if (context.rtl_user_thread_start32.has_value())
{
ctx.Context.Eip = static_cast<uint32_t>(context.rtl_user_thread_start32.value());
ctx.Context.Ebx = static_cast<uint32_t>(this->argument);
}
});
@@ -473,3 +485,14 @@ void emulator_thread::setup_registers(x86_64_emulator& emu, const process_contex
emu.reg(x86_register::rdx, context.ntdll_image_base);
emu.reg(x86_register::rip, context.ldr_initialize_thunk);
}
// Re-applies per-thread execution state that is not covered by the saved register set.
// For a thread that has a 32-bit TEB, the FS segment setup is redone for that TEB;
// threads without one are left untouched.
void emulator_thread::refresh_execution_context(x86_64_emulator& emu) const
{
    // The emulator handle is not needed at the moment; the cast only marks it as intentionally unused.
    static_cast<void>(emu);

    if (!this->teb32.has_value())
    {
        return;
    }

    // Refresh GDT entry for FS selector on context switch
    setup_wow64_fs_segment(*this->memory_ptr, this->teb32->value());
}

View File

@@ -49,7 +49,7 @@ class emulator_thread : public ref_counted_object
}
emulator_thread(memory_manager& memory, const process_context& context, uint64_t start_address, uint64_t argument, uint64_t stack_size,
bool suspended, uint32_t id);
uint32_t create_flags, uint32_t id, bool initial_thread);
emulator_thread(const emulator_thread&) = delete;
emulator_thread& operator=(const emulator_thread&) = delete;
@@ -86,6 +86,7 @@ class emulator_thread : public ref_counted_object
bool await_any{false};
bool waiting_for_alert{false};
bool alerted{false};
uint32_t create_flags{0};
uint32_t suspended{0};
std::optional<std::chrono::steady_clock::time_point> await_time{};
@@ -123,6 +124,7 @@ class emulator_thread : public ref_counted_object
// Makes this thread's state current in the emulator: first loads the register
// snapshot kept in last_registers, then refreshes the thread's execution context.
void restore(x86_64_emulator& emu) const
{
emu.restore_registers(this->last_registers);
// Called after the registers have been restored.
this->refresh_execution_context(emu);
}
void setup_if_necessary(x86_64_emulator& emu, const process_context& context)
@@ -166,6 +168,7 @@ class emulator_thread : public ref_counted_object
buffer.write(this->waiting_for_alert);
buffer.write(this->alerted);
buffer.write(this->create_flags);
buffer.write(this->suspended);
buffer.write_optional(this->await_time);
@@ -213,6 +216,7 @@ class emulator_thread : public ref_counted_object
buffer.read(this->waiting_for_alert);
buffer.read(this->alerted);
buffer.read(this->create_flags);
buffer.read(this->suspended);
buffer.read_optional(this->await_time);
@@ -245,6 +249,7 @@ class emulator_thread : public ref_counted_object
private:
void setup_registers(x86_64_emulator& emu, const process_context& context) const;
void refresh_execution_context(x86_64_emulator& emu) const;
void release()
{

View File

@@ -224,12 +224,6 @@ mapped_module* module_manager::map_module_core(const pe_detection_result& detect
const auto image_base = mod.image_base;
const auto entry = this->modules_.try_emplace(image_base, std::move(mod));
this->last_module_cache_ = this->modules_.end();
// TODO: Patch shell32.dll entry point to prevent TLS storage issues
// The shell32.dll module in SysWOW64 has TLS storage that fails, causing crashes
// This is a temporary workaround until the root cause is investigated and fixed
this->patch_shell32_entry_point_if_needed(entry.first->second);
this->callbacks_->on_module_load(entry.first->second);
return &entry.first->second;
}
@@ -573,49 +567,3 @@ bool module_manager::unmap(const uint64_t address)
return true;
}
// Replaces the entry point of the SysWOW64 copy of shell32.dll with a tiny stub
// (mov eax, 1; ret 0Ch) so that the entry point returns immediately.
// Modules that are not named shell32.dll, whose path does not contain "syswow64",
// or that have no entry point are left unmodified.
void module_manager::patch_shell32_entry_point_if_needed(mapped_module& mod)
{
    // Lowercases a copy of the given text so the comparisons below are case-insensitive
    const auto to_lower = [](std::string text) {
        std::transform(text.begin(), text.end(), text.begin(),
                       [](unsigned char ch) { return static_cast<char>(std::tolower(ch)); });
        return text;
    };

    // Only shell32.dll is of interest
    if (to_lower(mod.name) != "shell32.dll")
    {
        return;
    }

    // The 32-bit variant is recognised by "syswow64" appearing in its path
    if (to_lower(mod.path.string()).find("syswow64") == std::string::npos)
    {
        return;
    }

    // Nothing to patch without an entry point
    if (mod.entry_point == 0)
    {
        return;
    }

    // Page that contains the entry point
    const auto page_length = 0x1000;
    const auto patched_page = mod.entry_point & ~0xFFFULL;

    // Make the page writable for the duration of the patch and remember the previous protection
    nt_memory_permission previous_permission(memory_permission::none);
    if (!this->memory_->protect_memory(patched_page, page_length, memory_permission::all, &previous_permission))
    {
        return; // Failed to change protection
    }

    // Stub written over the entry point: mov eax, 1 ; ret 0Ch
    constexpr std::array<uint8_t, 8> entry_stub = {0xB8, 0x01, 0x00, 0x00, 0x00, 0xC2, 0x0C, 0x00};
    this->memory_->write_memory(mod.entry_point, entry_stub.data(), entry_stub.size());

    // Put the original protection back
    this->memory_->protect_memory(patched_page, page_length, previous_permission, nullptr);
}

View File

@@ -183,9 +183,6 @@ class module_manager
mapped_module* map_module_core(const pe_detection_result& detection_result, const std::function<mapped_module()>& mapper,
const logger& logger, bool is_static);
// Shell32.dll entry point patching to prevent TLS storage issues
void patch_shell32_entry_point_if_needed(mapped_module& mod);
// Execution mode detection
execution_mode detect_execution_mode(const windows_path& executable_path, const logger& logger);

View File

@@ -528,9 +528,9 @@ generic_handle_store* process_context::get_handle_store(const handle handle)
}
handle process_context::create_thread(memory_manager& memory, const uint64_t start_address, const uint64_t argument,
const uint64_t stack_size, const bool suspended)
const uint64_t stack_size, const uint32_t create_flags, const bool initial_thread)
{
emulator_thread t{memory, *this, start_address, argument, stack_size, suspended, ++this->spawned_thread_count};
emulator_thread t{memory, *this, start_address, argument, stack_size, create_flags, ++this->spawned_thread_count, initial_thread};
auto [h, thr] = this->threads.store_and_get(std::move(t));
this->callbacks_->on_thread_create(h, *thr);
return h;

View File

@@ -74,7 +74,8 @@ struct process_context
const mapped_module& executable, const mapped_module& ntdll, const apiset::container& apiset_container,
const mapped_module* ntdll32 = nullptr);
handle create_thread(memory_manager& memory, uint64_t start_address, uint64_t argument, uint64_t stack_size, bool suspended);
handle create_thread(memory_manager& memory, uint64_t start_address, uint64_t argument, uint64_t stack_size, uint32_t create_flags,
bool initial_thread = false);
std::optional<uint16_t> find_atom(std::u16string_view name);
uint16_t add_or_find_atom(std::u16string name);

View File

@@ -214,15 +214,17 @@ namespace syscalls
if (info_class == ProcessTlsInformation)
{
constexpr auto thread_data_offset = offsetof(PROCESS_TLS_INFO, ThreadData);
if (process_information_length < thread_data_offset)
if (process_information_length < sizeof(PROCESS_TLS_INFORMATION) ||
(process_information_length - (sizeof(PROCESS_TLS_INFORMATION) - sizeof(THREAD_TLS_INFORMATION))) %
sizeof(THREAD_TLS_INFORMATION))
{
return STATUS_BUFFER_OVERFLOW;
return STATUS_INFO_LENGTH_MISMATCH;
}
const emulator_object<THREAD_TLS_INFO> data{c.emu, process_information + thread_data_offset};
constexpr auto thread_data_offset = offsetof(PROCESS_TLS_INFORMATION, ThreadData);
const emulator_object<THREAD_TLS_INFORMATION> data{c.emu, process_information + thread_data_offset};
PROCESS_TLS_INFO tls_info{};
PROCESS_TLS_INFORMATION tls_info{};
c.emu.read_memory(process_information, &tls_info, thread_data_offset);
for (uint32_t i = 0; i < tls_info.ThreadDataCount; ++i)
@@ -242,38 +244,78 @@ namespace syscalls
entry.Flags = 2;
thread_iterator->second.teb64->access([&](TEB64& teb) {
const auto is_wow64 = c.win_emu.process.is_wow64_process;
const auto& thread = thread_iterator->second;
thread.teb64->access([&](TEB64& teb) {
entry.ThreadId = teb.ClientId.UniqueThread;
const auto tls_vector = teb.ThreadLocalStoragePointer;
constexpr auto ptr_size = sizeof(EmulatorTraits<Emu64>::PVOID);
uint64_t tls_vector = teb.ThreadLocalStoragePointer;
const auto ptr_size = is_wow64 ? sizeof(EmulatorTraits<Emu32>::PVOID) : sizeof(EmulatorTraits<Emu64>::PVOID);
if (is_wow64)
{
if (!thread.teb32.has_value())
{
return;
}
thread.teb32->access([&tls_vector](const TEB32& teb32) { tls_vector = teb32.ThreadLocalStoragePointer; });
}
if (!tls_vector)
{
return;
}
if (tls_info.TlsRequest == ProcessTlsReplaceIndex)
if (tls_info.OperationType == ProcessTlsReplaceIndex)
{
const auto tls_entry_ptr = tls_vector + (tls_info.TlsIndex * ptr_size);
uint64_t old_entry{};
const auto old_entry = c.emu.read_memory<EmulatorTraits<Emu64>::PVOID>(tls_entry_ptr);
c.emu.write_memory<EmulatorTraits<Emu64>::PVOID>(tls_entry_ptr, entry.TlsModulePointer);
entry.TlsModulePointer = old_entry;
}
else if (tls_info.TlsRequest == ProcessTlsReplaceVector)
{
const auto new_tls_vector = entry.TlsVector;
for (uint32_t index = 0; index < tls_info.TlsVectorLength; ++index)
if (is_wow64)
{
const auto old_entry = c.emu.read_memory<uint64_t>(tls_vector + index * ptr_size);
c.emu.write_memory(new_tls_vector + index * ptr_size, old_entry);
old_entry = c.emu.read_memory<EmulatorTraits<Emu32>::PVOID>(tls_entry_ptr);
c.emu.write_memory<EmulatorTraits<Emu32>::PVOID>(tls_entry_ptr, static_cast<uint32_t>(entry.NewTlsData));
}
else
{
old_entry = c.emu.read_memory<EmulatorTraits<Emu64>::PVOID>(tls_entry_ptr);
c.emu.write_memory<EmulatorTraits<Emu64>::PVOID>(tls_entry_ptr, entry.NewTlsData);
}
teb.ThreadLocalStoragePointer = new_tls_vector;
entry.TlsVector = tls_vector;
entry.OldTlsData = old_entry;
}
else if (tls_info.OperationType == ProcessTlsReplaceVector)
{
const auto new_tls_vector = entry.NewTlsData;
for (uint32_t index = 0; index < tls_info.PreviousCount; ++index)
{
if (is_wow64)
{
const auto old_entry = c.emu.read_memory<uint32_t>(tls_vector + (index * ptr_size));
c.emu.write_memory(new_tls_vector + (index * ptr_size), old_entry);
}
else
{
const auto old_entry = c.emu.read_memory<uint64_t>(tls_vector + (index * ptr_size));
c.emu.write_memory(new_tls_vector + (index * ptr_size), old_entry);
}
}
if (is_wow64)
{
thread.teb32->access([&new_tls_vector](TEB32& teb32) {
teb32.ThreadLocalStoragePointer = static_cast<uint32_t>(new_tls_vector);
});
}
else
{
teb.ThreadLocalStoragePointer = new_tls_vector;
}
entry.OldTlsData = tls_vector;
}
});
}

View File

@@ -103,17 +103,44 @@ namespace syscalls
for (const auto& t : c.proc.threads | std::views::values)
{
t.teb64->access([&](TEB64& teb) {
if (tls_cell < TLS_MINIMUM_AVAILABLE)
if (tls_cell < TLS_MINIMUM_AVAILABLE)
{
if (c.proc.is_wow64_process)
{
teb.TlsSlots.arr[tls_cell] = 0;
if (t.teb32.has_value())
{
t.teb32->access([&](TEB32& teb32) { teb32.TlsSlots.arr[tls_cell] = 0; });
}
}
else if (teb.TlsExpansionSlots)
else
{
const emulator_object<emulator_pointer> expansion_slots(c.emu, teb.TlsExpansionSlots);
expansion_slots.write(0, tls_cell - TLS_MINIMUM_AVAILABLE);
t.teb64->access([&](TEB64& teb64) { teb64.TlsSlots.arr[tls_cell] = 0; });
}
});
}
else if (tls_cell < TLS_MINIMUM_AVAILABLE + TLS_EXPANSION_SLOTS)
{
    // Cells in this range are cleared through the array that TlsExpansionSlots points to.
    // That array is indexed relative to TLS_MINIMUM_AVAILABLE, so the cell number has to be
    // rebased BEFORE it is scaled by the slot size. Subtracting TLS_MINIMUM_AVAILABLE from the
    // already scaled byte offset would clear the wrong slot and could write past the array.
    const auto expansion_index = static_cast<uint64_t>(tls_cell - TLS_MINIMUM_AVAILABLE);

    if (c.proc.is_wow64_process)
    {
        // WOW64: the slots seen by 32-bit code are 4 bytes wide and are reached via the 32-bit TEB
        if (t.teb32.has_value())
        {
            t.teb32->access([&](TEB32& teb32) {
                if (teb32.TlsExpansionSlots)
                {
                    c.emu.write_memory<uint32_t>(teb32.TlsExpansionSlots + (sizeof(uint32_t) * expansion_index), 0);
                }
            });
        }
    }
    else
    {
        // Native 64-bit: slots are 8 bytes wide and are reached via the 64-bit TEB
        t.teb64->access([&](TEB64& teb64) {
            if (teb64.TlsExpansionSlots)
            {
                c.emu.write_memory<uint64_t>(teb64.TlsExpansionSlots + (sizeof(uint64_t) * expansion_index), 0);
            }
        });
    }
}
}
return STATUS_SUCCESS;
@@ -601,7 +628,7 @@ namespace syscalls
return STATUS_NOT_SUPPORTED;
}
const auto h = c.proc.create_thread(c.win_emu.memory, start_routine, argument, stack_size, create_flags & CREATE_SUSPENDED);
const auto h = c.proc.create_thread(c.win_emu.memory, start_routine, argument, stack_size, create_flags);
thread_handle.write(h);
if (!attribute_list)

View File

@@ -371,7 +371,7 @@ void windows_emulator::setup_process(const application_settings& app_settings)
this->dispatcher.setup(ntdll->exports, ntdll_data, win32u->exports, win32u_data);
const auto main_thread_id = context.create_thread(this->memory, this->mod_manager.executable->entry_point, 0,
this->mod_manager.executable->size_of_stack_commit, false);
this->mod_manager.executable->size_of_stack_commit, 0, true);
switch_to_thread(*this, main_thread_id);
}
@@ -517,6 +517,18 @@ void windows_emulator::setup_hooks()
this->emu().hook_memory_violation(
[&](const uint64_t address, const size_t size, const memory_operation operation, const memory_violation_type type) {
if (this->emu().reg<uint16_t>(x86_register::cs) == 0x33)
{
// loading gs selector only works in 64-bit mode
const auto required_gs_base = this->current_thread().gs_segment->get_base();
const auto actual_gs_base = this->emu().get_segment_base(x86_register::gs);
if (actual_gs_base != required_gs_base)
{
this->emu().set_segment_base(x86_register::gs, required_gs_base);
return memory_violation_continuation::restart;
}
}
auto region = this->memory.get_region_info(address);
if (region.permissions.is_guarded())
{