dubious timing optimizations; try to sleep between frames; skip at constant speed
This commit is contained in:
parent
64878d5c67
commit
69079f245a
16 changed files with 217 additions and 124 deletions
|
@ -173,22 +173,12 @@ Timing
|
|||
less accurate.
|
||||
|
||||
**TAISEI_FRAMELIMITER_SLEEP**
|
||||
| Default: ``0``
|
||||
| Default: ``3``
|
||||
|
||||
If over ``0``, tries to sleep this many milliseconds after every frame
|
||||
if it was processed quickly enough. This reduces CPU usage by having the
|
||||
game spend less time in a busy loop, but may hurt framerate stability if
|
||||
set too high, especially if the high resolution timer is disabled or
|
||||
not available.
|
||||
|
||||
**TAISEI_FRAMELIMITER_SLEEP_EXACT**
|
||||
| Default: ``1``
|
||||
|
||||
If ``1``, the framerate limiter will either try to sleep the exact
|
||||
amount of time set in ``TAISEI_FRAMELIMITER_SLEEP``, or none at all.
|
||||
Mitigates the aforementioned framerate stability issues by effectively
|
||||
making ``TAISEI_FRAMELIMITER_SLEEP`` do nothing if the value is too high
|
||||
for your system.
|
||||
If over ``0``, tries to give up processing time to other applications
|
||||
while waiting for the next frame, if at least ``frame_time / this_value``
|
||||
amount of time is remaining. Increasing this value reduces CPU usage,
|
||||
but may harm performance. Set to ``0`` for the v1.2 default behaviour.
|
||||
|
||||
**TAISEI_FRAMELIMITER_COMPENSATE**
|
||||
| Default: ``1``
|
||||
|
|
|
@ -169,6 +169,8 @@ if not (have_vla and have_complex)
|
|||
endif
|
||||
|
||||
config.set('TAISEI_BUILDCONF_HAVE_TIMESPEC', have_timespec)
|
||||
config.set('TAISEI_BUILDCONF_HAVE_INT128', cc.sizeof('__int128') == 16)
|
||||
config.set('TAISEI_BUILDCONF_HAVE_LONG_DOUBLE', cc.sizeof('long double') > 8)
|
||||
|
||||
macos_app_bundle = get_option('macos_bundle') and host_machine.system() == 'darwin'
|
||||
|
||||
|
|
|
@ -80,7 +80,6 @@
|
|||
CONFIGDEF_INT (VID_WIDTH, "vid_width", RESX) \
|
||||
CONFIGDEF_INT (VID_HEIGHT, "vid_height", RESY) \
|
||||
CONFIGDEF_INT (VID_RESIZABLE, "vid_resizable", 0) \
|
||||
CONFIGDEF_INT (VID_LATE_SWAP, "vid_late_swap", 0) \
|
||||
CONFIGDEF_INT (VID_FRAMESKIP, "vid_frameskip", 1) \
|
||||
CONFIGDEF_INT (VSYNC, "vsync", 0) \
|
||||
CONFIGDEF_INT (MIXER_CHUNKSIZE, "mixer_chunksize", 1024) \
|
||||
|
@ -100,6 +99,7 @@
|
|||
CONFIGDEF_INT (FXAA, "fxaa", 1) \
|
||||
CONFIGDEF_INT (POSTPROCESS, "postprocess", 2) \
|
||||
CONFIGDEF_INT (HEALTHBAR_STYLE, "healthbar_style", 1) \
|
||||
CONFIGDEF_INT (SKIP_SPEED, "skip_speed", 10) \
|
||||
KEYDEFS \
|
||||
CONFIGDEF_INT (GAMEPAD_ENABLED, "gamepad_enabled", 0) \
|
||||
CONFIGDEF_STRING (GAMEPAD_DEVICE, "gamepad_device", "default") \
|
||||
|
|
|
@ -286,7 +286,7 @@ void events_emit(TaiseiEvent type, int32_t code, void *data1, void *data2) {
|
|||
void events_pause_keyrepeat(void) {
|
||||
// workaround for SDL bug
|
||||
// https://bugzilla.libsdl.org/show_bug.cgi?id=3287
|
||||
keyrepeat_paused_until = time_get() + 0.25;
|
||||
keyrepeat_paused_until = time_get() + HRTIME_RESOLUTION / 4;
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
101
src/framerate.c
101
src/framerate.c
|
@ -13,14 +13,15 @@
|
|||
#include "video.h"
|
||||
|
||||
void fpscounter_reset(FPSCounter *fps) {
|
||||
hrtime_t frametime = 1.0 / FPS;
|
||||
hrtime_t frametime = HRTIME_RESOLUTION / FPS;
|
||||
const int log_size = sizeof(fps->frametimes)/sizeof(hrtime_t);
|
||||
|
||||
for(int i = 0; i < log_size; ++i) {
|
||||
fps->frametimes[i] = frametime;
|
||||
}
|
||||
|
||||
fps->fps = 1.0 / frametime;
|
||||
fps->fps = HRTIME_RESOLUTION / (long double)frametime;
|
||||
fps->frametime = frametime;
|
||||
fps->last_update_time = time_get();
|
||||
}
|
||||
|
||||
|
@ -31,13 +32,14 @@ void fpscounter_update(FPSCounter *fps) {
|
|||
memmove(fps->frametimes, fps->frametimes + 1, (log_size - 1) * sizeof(hrtime_t));
|
||||
fps->frametimes[log_size - 1] = frametime;
|
||||
|
||||
hrtime_t avg = 0.0;
|
||||
hrtime_t avg = 0;
|
||||
|
||||
for(int i = 0; i < log_size; ++i) {
|
||||
avg += fps->frametimes[i];
|
||||
}
|
||||
|
||||
fps->fps = 1.0 / (avg / log_size);
|
||||
fps->fps = HRTIME_RESOLUTION / (avg / (long double)log_size);
|
||||
fps->frametime = avg / log_size;
|
||||
fps->last_update_time = time_get();
|
||||
}
|
||||
|
||||
|
@ -60,20 +62,17 @@ void loop_at_fps(LogicFrameFunc logic_frame, RenderFrameFunc render_frame, void
|
|||
|
||||
hrtime_t frame_start_time = time_get();
|
||||
hrtime_t next_frame_time = frame_start_time;
|
||||
hrtime_t target_frame_time = ((hrtime_t)1.0) / fps;
|
||||
hrtime_t target_frame_time = HRTIME_RESOLUTION / fps;
|
||||
|
||||
FrameAction rframe_action = RFRAME_SWAP;
|
||||
FrameAction lframe_action = LFRAME_WAIT;
|
||||
|
||||
int32_t delay = env_get("TAISEI_FRAMELIMITER_SLEEP", 0);
|
||||
bool exact_delay = env_get("TAISEI_FRAMELIMITER_SLEEP_EXACT", 1);
|
||||
int32_t sleep = env_get("TAISEI_FRAMELIMITER_SLEEP", 3);
|
||||
bool compensate = env_get("TAISEI_FRAMELIMITER_COMPENSATE", 1);
|
||||
bool uncapped_rendering_env = env_get("TAISEI_FRAMELIMITER_LOGIC_ONLY", 0);
|
||||
bool late_swap = config_get_int(CONFIG_VID_LATE_SWAP);
|
||||
|
||||
if(global.is_replay_verification) {
|
||||
uncapped_rendering_env = false;
|
||||
delay = 0;
|
||||
}
|
||||
|
||||
uint32_t frame_num = 0;
|
||||
|
@ -82,29 +81,12 @@ void loop_at_fps(LogicFrameFunc logic_frame, RenderFrameFunc render_frame, void
|
|||
static uint8_t recursion_detector;
|
||||
++recursion_detector;
|
||||
|
||||
#ifdef SPAM_FPS
|
||||
hrtime_t frametimes[4096];
|
||||
int frametimes_idx = 0;
|
||||
#endif
|
||||
|
||||
while(true) {
|
||||
bool uncapped_rendering = uncapped_rendering_env;
|
||||
frame_start_time = time_get();
|
||||
|
||||
begin_frame:
|
||||
|
||||
#ifdef DEBUG
|
||||
if(gamekeypressed(KEY_FPSLIMIT_OFF)) {
|
||||
uncapped_rendering = false;
|
||||
} else {
|
||||
uncapped_rendering = uncapped_rendering_env;
|
||||
}
|
||||
#endif
|
||||
|
||||
if(late_swap && rframe_action == RFRAME_SWAP) {
|
||||
video_swap_buffers();
|
||||
}
|
||||
|
||||
global.fps.busy.last_update_time = time_get();
|
||||
|
||||
++frame_num;
|
||||
|
@ -132,7 +114,7 @@ begin_frame:
|
|||
hrtime_t frametime = target_frame_time;
|
||||
|
||||
if(lframe_action == LFRAME_SKIP) {
|
||||
frametime *= 0.1;
|
||||
frametime /= imax(1, config_get_int(CONFIG_SKIP_SPEED));
|
||||
}
|
||||
|
||||
next_frame_time += frametime;
|
||||
|
@ -141,7 +123,7 @@ begin_frame:
|
|||
|
||||
if(total > target_frame_time) {
|
||||
next_frame_time = frame_start_time;
|
||||
log_debug("Executing logic took too long (%f), giving up", (double)total);
|
||||
log_debug("Executing logic took too long (%"PRIuTIME"), giving up", total);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -154,8 +136,12 @@ begin_frame:
|
|||
);
|
||||
}
|
||||
} else {
|
||||
lframe_action = logic_frame(arg);
|
||||
fpscounter_update(&global.fps.logic);
|
||||
uint cnt = 0;
|
||||
|
||||
do {
|
||||
lframe_action = logic_frame(arg);
|
||||
fpscounter_update(&global.fps.logic);
|
||||
} while(lframe_action == LFRAME_SKIP && ++cnt < config_get_int(CONFIG_SKIP_SPEED));
|
||||
}
|
||||
|
||||
if(taisei_quit_requested()) {
|
||||
|
@ -168,35 +154,19 @@ begin_frame:
|
|||
r_framebuffer_clear(NULL, CLEAR_ALL, RGBA(0, 0, 0, 1), 1);
|
||||
rframe_action = render_frame(arg);
|
||||
fpscounter_update(&global.fps.render);
|
||||
|
||||
#ifdef SPAM_FPS
|
||||
frametimes[frametimes_idx++] = *global.fps.render.frametimes;
|
||||
size_t s = sizeof(frametimes)/sizeof(*frametimes);
|
||||
|
||||
if(frametimes_idx == s) {
|
||||
hrtime_t total = 0;
|
||||
|
||||
for(int i = 0; i < s; ++i) {
|
||||
total += frametimes[i];
|
||||
}
|
||||
|
||||
frametimes_idx = 0;
|
||||
log_info("%zi frames in %.2fs = %.2f FPS", s, (double)total, (double)(1 / (total / s)));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
if(lframe_action == LFRAME_STOP) {
|
||||
break;
|
||||
}
|
||||
|
||||
if(!late_swap && rframe_action == RFRAME_SWAP) {
|
||||
if(rframe_action == RFRAME_SWAP) {
|
||||
video_swap_buffers();
|
||||
}
|
||||
|
||||
fpscounter_update(&global.fps.busy);
|
||||
|
||||
if(lframe_action == LFRAME_SKIP || uncapped_rendering) {
|
||||
if(/*lframe_action == LFRAME_SKIP ||*/ uncapped_rendering) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -207,40 +177,31 @@ begin_frame:
|
|||
#endif
|
||||
|
||||
next_frame_time = frame_start_time + target_frame_time;
|
||||
// next_frame_time = frame_start_time + 2 * target_frame_time - global.fps.logic.frametime;
|
||||
|
||||
if(compensate) {
|
||||
hrtime_t rt = time_get();
|
||||
hrtime_t diff = rt - next_frame_time;
|
||||
|
||||
if(diff >= 0) {
|
||||
if(rt > next_frame_time) {
|
||||
// frame took too long...
|
||||
// try to compensate in the next frame to avoid slowdown
|
||||
frame_start_time = rt - min(diff, target_frame_time);
|
||||
frame_start_time = rt - imin(rt - next_frame_time, target_frame_time);
|
||||
goto begin_frame;
|
||||
}
|
||||
}
|
||||
|
||||
if(delay > 0) {
|
||||
int32_t realdelay = delay;
|
||||
int32_t maxdelay = (int32_t)(1000 * (next_frame_time - time_get()));
|
||||
|
||||
if(realdelay > maxdelay) {
|
||||
if(exact_delay) {
|
||||
log_debug("Delay of %i ignored. Maximum is %i, TAISEI_FRAMELIMITER_SLEEP_EXACT is active", realdelay, maxdelay);
|
||||
realdelay = 0;
|
||||
} else {
|
||||
log_debug("Delay reduced from %i to %i", realdelay, maxdelay);
|
||||
realdelay = maxdelay;
|
||||
}
|
||||
}
|
||||
|
||||
if(realdelay > 0) {
|
||||
SDL_Delay(realdelay);
|
||||
if(sleep > 0) {
|
||||
// CAUTION: All of these casts are important!
|
||||
while((shrtime_t)next_frame_time - (shrtime_t)time_get() > (shrtime_t)target_frame_time / sleep) {
|
||||
uint32_t nap_multiplier = 1;
|
||||
uint32_t nap_divisor = 3;
|
||||
hrtime_t nap_raw = imax(0, (shrtime_t)next_frame_time - (shrtime_t)time_get());
|
||||
uint32_t nap_sdl = (nap_multiplier * nap_raw * 1000) / (HRTIME_RESOLUTION * nap_divisor);
|
||||
nap_sdl = imax(nap_sdl, 1);
|
||||
SDL_Delay(nap_sdl);
|
||||
}
|
||||
}
|
||||
|
||||
while(time_get() < next_frame_time) {
|
||||
continue;
|
||||
}
|
||||
while(time_get() < next_frame_time);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
typedef struct {
|
||||
hrtime_t frametimes[120]; // size = number of frames to average
|
||||
double fps; // average fps over the last X frames
|
||||
hrtime_t frametime; // average frame time over the last X frames;
|
||||
hrtime_t last_update_time; // internal; last time the average was recalculated
|
||||
} FPSCounter;
|
||||
|
||||
|
|
|
@ -16,7 +16,13 @@ static hrtime_t time_current;
|
|||
static hrtime_t time_offset;
|
||||
static uint64_t prev_hires_time;
|
||||
static uint64_t prev_hires_freq;
|
||||
static SDL_mutex *paranoia;
|
||||
static uint64_t fast_path_mul;
|
||||
|
||||
static inline attr_must_inline void set_freq(uint64_t freq) {
|
||||
prev_hires_freq = freq;
|
||||
lldiv_t d = lldiv(HRTIME_RESOLUTION, freq);
|
||||
fast_path_mul = d.quot * (d.rem == 0);
|
||||
}
|
||||
|
||||
static void time_update(void) {
|
||||
bool retry;
|
||||
|
@ -28,18 +34,24 @@ static void time_update(void) {
|
|||
uint64_t cntr = SDL_GetPerformanceCounter();
|
||||
|
||||
if(freq != prev_hires_freq) {
|
||||
log_debug("High resolution timer frequency changed: was %"PRIu64", now %"PRIu64". Saved time offset: %.16Lf", prev_hires_freq, freq, time_offset);
|
||||
log_debug("High resolution timer frequency changed: was %"PRIu64", now %"PRIu64". Saved time offset: %"PRIuTIME"", prev_hires_freq, freq, time_offset);
|
||||
time_offset = time_current;
|
||||
prev_hires_freq = freq;
|
||||
set_freq(freq);
|
||||
prev_hires_time = SDL_GetPerformanceCounter();
|
||||
retry = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
hrtime_t time_new = time_offset + (hrtime_t)(cntr - prev_hires_time) / freq;
|
||||
hrtime_t time_new;
|
||||
|
||||
if(fast_path_mul) {
|
||||
time_new = time_offset + (cntr - prev_hires_time) * fast_path_mul;
|
||||
} else {
|
||||
time_new = time_offset + umuldiv64(cntr - prev_hires_time, HRTIME_RESOLUTION, freq);
|
||||
}
|
||||
|
||||
if(time_new < time_current) {
|
||||
log_warn("BUG: time went backwards. Was %.16Lf, now %.16Lf. Possible cause: your OS sucks spherical objects. Attempting to correct this...", time_current, time_new);
|
||||
log_warn("BUG: time went backwards. Was %"PRIuTIME", now %"PRIuTIME". Possible cause: your OS sucks spherical objects. Attempting to correct this...", time_current, time_new);
|
||||
time_offset = time_current;
|
||||
time_current = 0;
|
||||
prev_hires_time = SDL_GetPerformanceCounter();
|
||||
|
@ -54,15 +66,9 @@ void time_init(void) {
|
|||
use_hires = env_get("TAISEI_HIRES_TIMER", 1);
|
||||
|
||||
if(use_hires) {
|
||||
if(!(paranoia = SDL_CreateMutex())) {
|
||||
log_warn("Not using the system high resolution timer: SDL_CreateMutex() failed: %s", SDL_GetError());
|
||||
use_hires = false;
|
||||
return;
|
||||
}
|
||||
|
||||
log_info("Using the system high resolution timer");
|
||||
prev_hires_time = SDL_GetPerformanceCounter();
|
||||
prev_hires_freq = SDL_GetPerformanceFrequency();
|
||||
set_freq(SDL_GetPerformanceFrequency());
|
||||
} else {
|
||||
log_info("Not using the system high resolution timer: disabled by environment");
|
||||
return;
|
||||
|
@ -70,20 +76,15 @@ void time_init(void) {
|
|||
}
|
||||
|
||||
void time_shutdown(void) {
|
||||
if(paranoia) {
|
||||
SDL_DestroyMutex(paranoia);
|
||||
paranoia = NULL;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
hrtime_t time_get(void) {
|
||||
if(use_hires) {
|
||||
SDL_LockMutex(paranoia);
|
||||
assert(is_main_thread());
|
||||
time_update();
|
||||
hrtime_t t = time_current;
|
||||
SDL_UnlockMutex(paranoia);
|
||||
return t;
|
||||
return time_current;
|
||||
}
|
||||
|
||||
return SDL_GetTicks() / 1000.0;
|
||||
return SDL_GetTicks() * (HRTIME_RESOLUTION / 1000);
|
||||
}
|
||||
|
|
|
@ -9,7 +9,14 @@
|
|||
#pragma once
|
||||
#include "taisei.h"
|
||||
|
||||
typedef long double hrtime_t;
|
||||
typedef uint64_t hrtime_t;
|
||||
typedef int64_t shrtime_t;
|
||||
#define PRIuTIME PRIu64
|
||||
#define PRIdTIME PRId64
|
||||
#define HRTIME_C(value) UINT64_C(value)
|
||||
|
||||
// picoseconds. like super duper accurate, man
|
||||
#define HRTIME_RESOLUTION HRTIME_C(1000000000000)
|
||||
|
||||
void time_init(void);
|
||||
void time_shutdown(void);
|
||||
|
|
|
@ -81,8 +81,12 @@ void sdl_log(void *userdata, int category, SDL_LogPriority priority, const char
|
|||
static void init_sdl(void) {
|
||||
SDL_version v;
|
||||
|
||||
if(SDL_Init(SDL_INIT_EVENTS) < 0)
|
||||
if(SDL_Init(SDL_INIT_EVENTS) < 0) {
|
||||
log_fatal("SDL_Init() failed: %s", SDL_GetError());
|
||||
}
|
||||
|
||||
// initialize it
|
||||
is_main_thread();
|
||||
|
||||
/*
|
||||
* TODO: refine this and make it optional
|
||||
|
|
|
@ -409,13 +409,6 @@ void options_sub_video(MenuData *parent, void *arg) {
|
|||
bind_addvalue(b, "off");
|
||||
bind_addvalue(b, "adaptive");
|
||||
|
||||
#if 0
|
||||
add_menu_entry(m, "Swap buffers", do_nothing,
|
||||
b = bind_option(CONFIG_VID_LATE_SWAP, bind_common_onoff_get, bind_common_onoff_set)
|
||||
); bind_addvalue(b, "late");
|
||||
bind_addvalue(b, "early");
|
||||
#endif
|
||||
|
||||
add_menu_entry(m, "Skip frames", do_nothing,
|
||||
b = bind_option(CONFIG_VID_FRAMESKIP, bind_common_intplus1_get, bind_common_intplus1_set)
|
||||
); bind_addvalue(b, "0");
|
||||
|
|
|
@ -63,8 +63,6 @@ typedef struct ResourceAsyncLoadData {
|
|||
void *opaque;
|
||||
} ResourceAsyncLoadData;
|
||||
|
||||
static SDL_threadID main_thread_id; // TODO: move this somewhere else
|
||||
|
||||
static inline ResourceHandler* get_handler(ResourceType type) {
|
||||
return *(_handlers + type);
|
||||
}
|
||||
|
@ -118,7 +116,7 @@ static void finish_async_load(InternalResource *ires, ResourceAsyncLoadData *dat
|
|||
static ResourceStatus wait_for_resource_load(InternalResource *ires, uint32_t want_flags) {
|
||||
SDL_LockMutex(ires->mutex);
|
||||
|
||||
if(ires->async_task != NULL && SDL_ThreadID() == main_thread_id) {
|
||||
if(ires->async_task != NULL && is_main_thread()) {
|
||||
assert(ires->status == RES_STATUS_LOADING);
|
||||
|
||||
ResourceAsyncLoadData *data;
|
||||
|
@ -189,7 +187,7 @@ static void* load_resource_async_task(void *vdata) {
|
|||
}
|
||||
|
||||
static bool resource_asyncload_handler(SDL_Event *evt, void *arg) {
|
||||
assert(SDL_ThreadID() == main_thread_id);
|
||||
assert(is_main_thread());
|
||||
|
||||
InternalResource *ires = evt->user.data1;
|
||||
|
||||
|
@ -414,8 +412,6 @@ void preload_resources(ResourceType type, ResourceFlags flags, const char *first
|
|||
}
|
||||
|
||||
void init_resources(void) {
|
||||
main_thread_id = SDL_ThreadID();
|
||||
|
||||
for(int i = 0; i < RES_NUMTYPES; ++i) {
|
||||
ResourceHandler *h = get_handler(i);
|
||||
alloc_handler(h);
|
||||
|
|
|
@ -1176,7 +1176,7 @@ void stage_draw_hud_text(struct labels_s* labels) {
|
|||
|
||||
static void fill_graph(int num_samples, float *samples, FPSCounter *fps) {
|
||||
for(int i = 0; i < num_samples; ++i) {
|
||||
samples[i] = fps->frametimes[i] / (((hrtime_t)2.0)/FPS);
|
||||
samples[i] = fps->frametimes[i] / (2.0 * (HRTIME_RESOLUTION / (long double)FPS));
|
||||
|
||||
if(samples[i] > 1.0) {
|
||||
samples[i] = 1.0;
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <SDL_thread.h>
|
||||
|
||||
void* memdup(const void *src, size_t size) {
|
||||
void *data = malloc(size);
|
||||
|
@ -29,3 +30,16 @@ void inherit_missing_pointers(uint num, void *dest[num], void *const base[num])
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool is_main_thread(void) {
|
||||
static bool initialized = false;
|
||||
static SDL_threadID main_thread_id = 0;
|
||||
SDL_threadID tid = SDL_ThreadID();
|
||||
|
||||
if(!initialized) {
|
||||
main_thread_id = tid;
|
||||
}
|
||||
|
||||
return main_thread_id == tid;
|
||||
}
|
||||
|
||||
|
|
|
@ -11,3 +11,4 @@
|
|||
|
||||
void* memdup(const void *src, size_t size);
|
||||
void inherit_missing_pointers(uint num, void *dest[num], void *const base[num]);
|
||||
bool is_main_thread(void);
|
||||
|
|
|
@ -11,6 +11,8 @@
|
|||
#include "miscmath.h"
|
||||
#include "assert.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
double approach(double v, double t, double d) {
|
||||
if(v < t) {
|
||||
v += d;
|
||||
|
@ -218,3 +220,120 @@ uint ipow10(uint n) {
|
|||
assert(n < sizeof(pow10)/sizeof(*pow10));
|
||||
return pow10[n];
|
||||
}
|
||||
|
||||
// A 128-bit quantity split into two 64-bit words.
typedef struct int128_bits {
|
||||
uint64_t hi; // upper 64 bits
|
||||
uint64_t lo; // lower 64 bits
|
||||
} int128_bits_t;
|
||||
|
||||
static inline attr_must_inline attr_unused
|
||||
void udiv_128_64(int128_bits_t divident, uint64_t divisor, uint64_t *out_quotient) {
|
||||
/*
|
||||
if(!divident.hi) {
|
||||
*out_quotient = divident.lo / divisor;
|
||||
return;
|
||||
}
|
||||
*/
|
||||
|
||||
uint64_t quotient = divident.lo << 1;
|
||||
uint64_t remainder = divident.hi;
|
||||
uint64_t carry = divident.lo >> 63;
|
||||
uint64_t temp_carry = 0;
|
||||
|
||||
for(int i = 0; i < 64; i++) {
|
||||
temp_carry = remainder >> 63;
|
||||
remainder <<= 1;
|
||||
remainder |= carry;
|
||||
carry = temp_carry;
|
||||
|
||||
if(carry == 0) {
|
||||
if(remainder >= divisor) {
|
||||
carry = 1;
|
||||
} else {
|
||||
temp_carry = quotient >> 63;
|
||||
quotient <<= 1;
|
||||
quotient |= carry;
|
||||
carry = temp_carry;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
remainder -= divisor;
|
||||
remainder -= (1 - carry);
|
||||
carry = 1;
|
||||
temp_carry = quotient >> 63;
|
||||
quotient <<= 1;
|
||||
quotient |= carry;
|
||||
carry = temp_carry;
|
||||
}
|
||||
|
||||
*out_quotient = quotient;
|
||||
}
|
||||
|
||||
static inline attr_must_inline attr_unused
|
||||
void umul_128_64(uint64_t multiplicant, uint64_t multiplier, int128_bits_t *result) {
|
||||
#if (defined(__x86_64) || defined(__x86_64__))
|
||||
__asm__ (
|
||||
"mulq %3"
|
||||
: "=a,a" (result->lo), "=d,d" (result->hi)
|
||||
: "%0,0" (multiplicant), "r,m" (multiplier)
|
||||
: "cc"
|
||||
);
|
||||
#else
|
||||
uint64_t u1 = (multiplicant & 0xffffffff);
|
||||
uint64_t v1 = (multiplier & 0xffffffff);
|
||||
uint64_t t = (u1 * v1);
|
||||
uint64_t w3 = (t & 0xffffffff);
|
||||
uint64_t k = (t >> 32);
|
||||
|
||||
multiplicant >>= 32;
|
||||
t = (multiplicant * v1) + k;
|
||||
k = (t & 0xffffffff);
|
||||
uint64_t w1 = (t >> 32);
|
||||
|
||||
multiplier >>= 32;
|
||||
t = (u1 * multiplier) + k;
|
||||
k = (t >> 32);
|
||||
|
||||
result->hi = (multiplicant * multiplier) + w1 + k;
|
||||
result->lo = (t << 32) + w3;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline attr_must_inline attr_unused
|
||||
uint64_t _umuldiv64_slow(uint64_t x, uint64_t multiplier, uint64_t divisor) {
|
||||
int128_bits_t intermediate;
|
||||
uint64_t result;
|
||||
umul_128_64(x, multiplier, &intermediate);
|
||||
udiv_128_64(intermediate, divisor, &result);
|
||||
return result;
|
||||
}
|
||||
|
||||
#include "util.h"
|
||||
|
||||
static inline attr_must_inline
|
||||
uint64_t _umuldiv64(uint64_t x, uint64_t multiplier, uint64_t divisor) {
|
||||
#if defined(TAISEI_BUILDCONF_HAVE_INT128)
|
||||
__extension__ typedef unsigned __int128 uint128_t;
|
||||
return ((uint128_t)x * (uint128_t)multiplier) / divisor;
|
||||
#elif defined(TAISEI_BUILDCONF_HAVE_LONG_DOUBLE)
|
||||
#define UMULDIV64_SANITY_CHECK
|
||||
return ((long double)x * (long double)multiplier) / (long double)divisor;
|
||||
#else
|
||||
return _umuldiv64_slow(x, multiplier, divisor);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
 * Compute (x * multiplier) / divisor with a 128-bit intermediate product.
 *
 * When UMULDIV64_SANITY_CHECK is defined, the first call probes _umuldiv64()
 * with UINT64_MAX operands; if the probe does not return UINT64_MAX, this and
 * all later calls use _umuldiv64_slow() instead.
 */
uint64_t umuldiv64(uint64_t x, uint64_t multiplier, uint64_t divisor) {
#ifdef UMULDIV64_SANITY_CHECK
	// Tri-state: -1 = not probed yet, 0 = probe failed, 1 = probe passed.
	// Must be explicitly signed: plain char may be unsigned, in which case
	// the -1 sentinel would never compare below zero and the probe would
	// never run.
	static signed char sanity = -1;

	if(sanity < 0) {
		sanity = (_umuldiv64(UINT64_MAX, UINT64_MAX, UINT64_MAX) == UINT64_MAX);
	}

	return (sanity ? _umuldiv64 : _umuldiv64_slow)(x, multiplier, divisor);
#else
	return _umuldiv64(x, multiplier, divisor);
#endif
}
|
||||
|
|
|
@ -39,6 +39,10 @@ uint ipow10(uint n) attr_const;
|
|||
float normpdf(float x, float sigma) attr_const;
|
||||
void gaussian_kernel_1d(size_t size, float sigma, float kernel[size]) attr_nonnull(3);
|
||||
|
||||
// Compute (a*b)/c with 128-bit intermediate precision.
|
||||
// If the final result would not fit into 64 bits, the return value is undefined.
|
||||
uint64_t umuldiv64(uint64_t x, uint64_t multiplier, uint64_t divisor);
|
||||
|
||||
#define topow2(x) (_Generic((x), \
|
||||
uint32_t: topow2_u32, \
|
||||
uint64_t: topow2_u64, \
|
||||
|
|
Loading…
Reference in a new issue