9 files changed, 246 insertions, 66 deletions
diff --git a/src/common/common_funcs.h b/src/common/common_funcs.h
index bc296ed3e..d56156e4a 100644
--- a/src/common/common_funcs.h
+++ b/src/common/common_funcs.h
@@ -14,8 +14,6 @@
 #define SLEEP(x) usleep(x*1000)
 #endif
 
-template <bool> struct CompileTimeAssert;
-template<> struct CompileTimeAssert<true> {};
 
 #define b2(x)   (   (x) | (   (x) >> 1) )
 #define b4(x)   ( b2(x) | ( b2(x) >> 2) )
diff --git a/src/common/logging/backend.cpp b/src/common/logging/backend.cpp
index 8fee20a83..7c1010b22 100644
--- a/src/common/logging/backend.cpp
+++ b/src/common/logging/backend.cpp
@@ -33,6 +33,7 @@ static std::shared_ptr<Logger> global_logger;
         CLS(Service) \
         SUB(Service, SRV) \
         SUB(Service, FS) \
+        SUB(Service, ERR) \
         SUB(Service, APT) \
         SUB(Service, GSP) \
         SUB(Service, AC) \
diff --git a/src/common/logging/log.h b/src/common/logging/log.h
index 6c5ca3968..7b67b3c07 100644
--- a/src/common/logging/log.h
+++ b/src/common/logging/log.h
@@ -53,6 +53,7 @@ enum class Class : ClassType {
                                 ///  should have its own subclass.
     Service_SRV,                ///< The SRV (Service Directory) implementation
     Service_FS,                 ///< The FS (Filesystem) service implementation
+    Service_ERR,                ///< The ERR (Error) port implementation
     Service_APT,                ///< The APT (Applets) service
     Service_GSP,                ///< The GSP (GPU control) service
     Service_AC,                 ///< The AC (WiFi status) service
diff --git a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
index b691ffbc3..3b508f617 100644
--- a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
+++ b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
@@ -4422,12 +4422,7 @@ unsigned InterpreterMainLoop(ARMul_State* state) {
             inst_cream->get_addr(cpu, inst_cream->inst, addr, 1);
 
             unsigned int value = Memory::Read32(addr);
-            if (BIT(CP15_REG(CP15_CONTROL), 22) == 1)
-                cpu->Reg[BITS(inst_cream->inst, 12, 15)] = value;
-            else {
-                value = ROTATE_RIGHT_32(value,(8*(addr&0x3)));
-                cpu->Reg[BITS(inst_cream->inst, 12, 15)] = value;
-            }
+            cpu->Reg[BITS(inst_cream->inst, 12, 15)] = value;
 
             if (BITS(inst_cream->inst, 12, 15) == 15) {
                 // For armv5t, should enter thumb when bits[0] is non-zero.
@@ -4450,12 +4445,7 @@ unsigned InterpreterMainLoop(ARMul_State* state) {
             inst_cream->get_addr(cpu, inst_cream->inst, addr, 1);
 
             unsigned int value = Memory::Read32(addr);
-            if (BIT(CP15_REG(CP15_CONTROL), 22) == 1)
-                cpu->Reg[BITS(inst_cream->inst, 12, 15)] = value;
-            else {
-                value = ROTATE_RIGHT_32(value,(8*(addr&0x3)));
-                cpu->Reg[BITS(inst_cream->inst, 12, 15)] = value;
-            }
+            cpu->Reg[BITS(inst_cream->inst, 12, 15)] = value;
 
             if (BITS(inst_cream->inst, 12, 15) == 15) {
                 // For armv5t, should enter thumb when bits[0] is non-zero.
@@ -4699,11 +4689,6 @@ unsigned InterpreterMainLoop(ARMul_State* state) {
             unsigned int value = Memory::Read32(addr);
             cpu->Reg[BITS(inst_cream->inst, 12, 15)] = value;
 
-            if (BIT(CP15_REG(CP15_CONTROL), 22) == 1)
-                cpu->Reg[BITS(inst_cream->inst, 12, 15)] = value;
-            else
-                cpu->Reg[BITS(inst_cream->inst, 12, 15)] = ROTATE_RIGHT_32(value,(8*(addr&0x3))) ;
-
             if (BITS(inst_cream->inst, 12, 15) == 15) {
                 INC_PC(sizeof(ldst_inst));
                 goto DISPATCH;
diff --git a/src/core/hle/service/err_f.cpp b/src/core/hle/service/err_f.cpp
index 8d765acb5..58c5acd1e 100644
--- a/src/core/hle/service/err_f.cpp
+++ b/src/core/hle/service/err_f.cpp
@@ -10,8 +10,171 @@
 
 namespace ERR_F {
 
+enum { // Values of ErrInfo::specifier; ThrowFatalError switches on them to pick a report layout
+    ErrSpecifier0 = 0, // logged through the errtype1 layout
+    ErrSpecifier1 = 1, // logged through the errtype1 layout
+    ErrSpecifier3 = 3, // logged through the errtype3 layout (exception type and CPU registers)
+    ErrSpecifier4 = 4, // logged through the errtype4 layout (result code and debug strings)
+};
+
+// This is used instead of ResultCode from result.h
+// because we can't have non-trivial data members in unions.
+union RSL {
+    u32 raw;                           // whole value, logged as "RSL: 0x%08X"
+    // NOTE(review): BitField args are presumably <first bit, bit count, storage type> - confirm.
+    BitField<0, 10, u32> description;  // logged as "Desc"
+    BitField<10, 8, u32> module;       // logged as "Module"
+    BitField<21, 6, u32> summary;      // logged as "Summary"
+    BitField<27, 5, u32> level;        // logged as "Level"
+};
+
+union ErrInfo { // Overlay of the report layouts that ThrowFatalError reads from &cmd_buff[1]
+    u8 specifier; // first byte of every layout below; ThrowFatalError switches on it
+
+    struct {
+        u8 specifier;                // 0x0
+        u8 rev_high;                 // 0x1, logged as bits 16-23 of "REV"
+        u16 rev_low;                 // 0x2, logged as bits 0-15 of "REV"
+        RSL result_code;             // 0x4
+        u32 address;                 // 0x8, logged as "ADR"
+        INSERT_PADDING_BYTES(4);     // 0xC
+        u32 pid_low;                 // 0x10
+        u32 pid_high;                // 0x14
+        u32 aid_low;                 // 0x18
+        u32 aid_high;                // 0x1C
+    } errtype1; // read for ErrSpecifier0 and ErrSpecifier1
+
+    struct {
+        u8 specifier;                // 0x0
+        u8 rev_high;                 // 0x1
+        u16 rev_low;                 // 0x2
+        INSERT_PADDING_BYTES(0xC);   // 0x4
+        u32 pid_low;                 // 0x10
+        u32 pid_high;                // 0x14
+        u32 aid_low;                 // 0x18
+        u32 aid_high;                // 0x1C
+        u8 error_type;               // 0x20, one of PrefetchAbort/DataAbort/UndefInstr/VectorFP
+        INSERT_PADDING_BYTES(3);     // 0x21
+        u32 fault_status_reg;        // 0x24, logged for PrefetchAbort and DataAbort
+        u32 fault_addr;              // 0x28, logged for PrefetchAbort and DataAbort
+        u32 fpexc;                   // 0x2C, logged for VectorFP
+        u32 finst;                   // 0x30, logged for VectorFP
+        u32 finst2;                  // 0x34, logged for VectorFP
+        INSERT_PADDING_BYTES(0x34);  // 0x38
+        u32 sp;                      // 0x6C
+        u32 pc;                      // 0x70
+        u32 lr;                      // 0x74
+        u32 cpsr;                    // 0x78
+    } errtype3; // read for ErrSpecifier3
+
+    struct {
+        u8 specifier;                // 0x0
+        u8 rev_high;                 // 0x1
+        u16 rev_low;                 // 0x2
+        RSL result_code;             // 0x4
+        INSERT_PADDING_BYTES(8);     // 0x8
+        u32 pid_low;                 // 0x10
+        u32 pid_high;                // 0x14
+        u32 aid_low;                 // 0x18
+        u32 aid_high;                // 0x1C
+        char debug_string1[0x2E];    // 0x20, fixed-size text; the layout enforces no terminator
+        char debug_string2[0x2E];    // 0x4E, fixed-size text; the layout enforces no terminator
+    } errtype4; // read for ErrSpecifier4
+};
+
+enum { // Codes compared against errtype3.error_type; GetErrInfo3Type turns them into names
+    PrefetchAbort = 0,
+    DataAbort     = 1,
+    UndefInstr    = 2,
+    VectorFP      = 3
+};
+
+/// Returns the printable name of an errtype3 exception code, or "unknown" for any other value.
+static std::string GetErrInfo3Type(u8 type_code) {
+    const char* name = "unknown";
+    if (type_code == PrefetchAbort)   name = "Prefetch Abort";
+    else if (type_code == DataAbort)  name = "Data Abort";
+    else if (type_code == UndefInstr) name = "Undefined Instruction";
+    else if (type_code == VectorFP)   name = "Vector Floating Point";
+    return name;
+}
+
+/// Handler for ERR_F ThrowFatalError: logs the report at cmd_buff[1], then writes success (0) back.
+static void ThrowFatalError(Service::Interface* self) {
+    u32* cmd_buff = Kernel::GetCommandBuffer();
+    const ErrInfo* errinfo = reinterpret_cast<ErrInfo*>(&cmd_buff[1]);
+    LOG_CRITICAL(Service_ERR, "Fatal error!");
+    switch (errinfo->specifier) { // the first report byte selects which ErrInfo layout to read
+    case ErrSpecifier0:
+    case ErrSpecifier1: {
+        const auto& errtype = errinfo->errtype1;
+        LOG_CRITICAL(Service_ERR, "PID: 0x%08X_0x%08X", errtype.pid_low, errtype.pid_high);
+        LOG_CRITICAL(Service_ERR, "REV: %d", errtype.rev_low | (errtype.rev_high << 16));
+        LOG_CRITICAL(Service_ERR, "AID: 0x%08X_0x%08X", errtype.aid_low, errtype.aid_high);
+        LOG_CRITICAL(Service_ERR, "ADR: 0x%08X", errtype.address);
+
+        LOG_CRITICAL(Service_ERR, "RSL: 0x%08X", errtype.result_code.raw);
+        LOG_CRITICAL(Service_ERR, "  Level: %u",   errtype.result_code.level.Value());
+        LOG_CRITICAL(Service_ERR, "  Summary: %u", errtype.result_code.summary.Value());
+        LOG_CRITICAL(Service_ERR, "  Module: %u",  errtype.result_code.module.Value());
+        LOG_CRITICAL(Service_ERR, "  Desc: %u",    errtype.result_code.description.Value());
+        break;
+    }
+
+    case ErrSpecifier3: {
+        const auto& errtype = errinfo->errtype3;
+        LOG_CRITICAL(Service_ERR, "PID: 0x%08X_0x%08X", errtype.pid_low, errtype.pid_high);
+        LOG_CRITICAL(Service_ERR, "REV: %d", errtype.rev_low | (errtype.rev_high << 16));
+        LOG_CRITICAL(Service_ERR, "AID: 0x%08X_0x%08X", errtype.aid_low, errtype.aid_high);
+        LOG_CRITICAL(Service_ERR, "TYPE: %s", GetErrInfo3Type(errtype.error_type).c_str());
+
+        LOG_CRITICAL(Service_ERR, "PC: 0x%08X", errtype.pc);
+        LOG_CRITICAL(Service_ERR, "LR: 0x%08X", errtype.lr);
+        LOG_CRITICAL(Service_ERR, "SP: 0x%08X", errtype.sp);
+        LOG_CRITICAL(Service_ERR, "CPSR: 0x%08X", errtype.cpsr);
+
+        switch (errtype.error_type) {
+        case PrefetchAbort:
+        case DataAbort:
+            LOG_CRITICAL(Service_ERR, "Fault Address: 0x%08X", errtype.fault_addr);
+            LOG_CRITICAL(Service_ERR, "Fault Status Register: 0x%08X", errtype.fault_status_reg);
+            break;
+        case VectorFP:
+            LOG_CRITICAL(Service_ERR, "FPEXC: 0x%08X", errtype.fpexc);
+            LOG_CRITICAL(Service_ERR, "FINST: 0x%08X", errtype.finst);
+            LOG_CRITICAL(Service_ERR, "FINST2: 0x%08X", errtype.finst2);
+            break;
+        }
+        break;
+    }
+
+    case ErrSpecifier4: {
+        const auto& errtype = errinfo->errtype4;
+        LOG_CRITICAL(Service_ERR, "PID: 0x%08X_0x%08X", errtype.pid_low, errtype.pid_high);
+        LOG_CRITICAL(Service_ERR, "REV: %d", errtype.rev_low | (errtype.rev_high << 16));
+        LOG_CRITICAL(Service_ERR, "AID: 0x%08X_0x%08X", errtype.aid_low, errtype.aid_high);
+
+        LOG_CRITICAL(Service_ERR, "RSL: 0x%08X", errtype.result_code.raw);
+        LOG_CRITICAL(Service_ERR, "  Level: %u",   errtype.result_code.level.Value());
+        LOG_CRITICAL(Service_ERR, "  Summary: %u", errtype.result_code.summary.Value());
+        LOG_CRITICAL(Service_ERR, "  Module: %u",  errtype.result_code.module.Value());
+        LOG_CRITICAL(Service_ERR, "  Desc: %u",    errtype.result_code.description.Value());
+        // The strings are fixed-size fields with no guaranteed terminator, so cap the read length.
+        LOG_CRITICAL(Service_ERR, "%.*s", static_cast<int>(sizeof(errtype.debug_string1)), errtype.debug_string1);
+        LOG_CRITICAL(Service_ERR, "%.*s", static_cast<int>(sizeof(errtype.debug_string2)), errtype.debug_string2);
+        break;
+    }
+
+    default: // no layout is defined for this value, so report it instead of staying silent
+        LOG_CRITICAL(Service_ERR, "Unknown error specifier: %d", errinfo->specifier);
+        break;
+    }
+
+    cmd_buff[1] = 0; // No error
+}
+
 const Interface::FunctionInfo FunctionTable[] = {
-    {0x00010800, nullptr,               "ThrowFatalError"}
+    {0x00010800, ThrowFatalError,           "ThrowFatalError"} // handler logs the submitted error report
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
index bd7d92cd1..e6022d69f 100644
--- a/src/core/hw/gpu.cpp
+++ b/src/core/hw/gpu.cpp
@@ -19,6 +19,7 @@
 
 #include "video_core/command_processor.h"
 #include "video_core/video_core.h"
+#include <video_core/color.h>
 
 
 namespace GPU {
@@ -132,12 +133,31 @@ inline void Write(u32 addr, const T data) {
                     switch (config.input_format) {
                     case Regs::PixelFormat::RGBA8:
                     {
-                        // TODO: Most likely got the component order messed up.
                         u8* srcptr = source_pointer + (x * pixel_skip + y * config.input_width) * 4;
-                        source_color.r = srcptr[0]; // blue
-                        source_color.g = srcptr[1]; // green
-                        source_color.b = srcptr[2]; // red
-                        source_color.a = srcptr[3]; // alpha
+                        source_color.r = srcptr[3]; // red
+                        source_color.g = srcptr[2]; // green
+                        source_color.b = srcptr[1]; // blue
+                        source_color.a = srcptr[0]; // alpha
+                        break;
+                    }
+
+                    case Regs::PixelFormat::RGB5A1:
+                    {
+                        u16 srcval = *(u16*)(source_pointer + x * 4 * pixel_skip + y * config.input_width * 4 * pixel_skip);
+                        source_color.r = Color::Convert5To8((srcval >> 11) & 0x1F); // red
+                        source_color.g = Color::Convert5To8((srcval >>  6) & 0x1F); // green
+                        source_color.b = Color::Convert5To8((srcval >>  1) & 0x1F); // blue
+                        source_color.a = Color::Convert1To8(srcval & 0x1);          // alpha
+                        break;
+                    }
+
+                    case Regs::PixelFormat::RGBA4:
+                    {
+                        u16 srcval = *(u16*)(source_pointer + x * 4 * pixel_skip + y * config.input_width * 4 * pixel_skip);
+                        source_color.r = Color::Convert4To8((srcval >> 12) & 0xF); // red
+                        source_color.g = Color::Convert4To8((srcval >>  8) & 0xF); // green
+                        source_color.b = Color::Convert4To8((srcval >>  4) & 0xF); // blue
+                        source_color.a = Color::Convert4To8( srcval        & 0xF); // alpha
                         break;
                     }
 
@@ -147,24 +167,38 @@ inline void Write(u32 addr, const T data) {
                     }
 
                     switch (config.output_format) {
-                    /*case Regs::PixelFormat::RGBA8:
+                    case Regs::PixelFormat::RGBA8:
                     {
-                        // TODO: Untested
-                        u8* dstptr = (u32*)(dest_pointer + x * 4 + y * config.output_width * 4);
-                        dstptr[0] = source_color.r;
-                        dstptr[1] = source_color.g;
-                        dstptr[2] = source_color.b;
-                        dstptr[3] = source_color.a;
+                        u8* dstptr = dest_pointer + (x * pixel_skip + y * config.output_width) * 4;
+                        dstptr[3] = source_color.r;
+                        dstptr[2] = source_color.g;
+                        dstptr[1] = source_color.b;
+                        dstptr[0] = source_color.a;
                         break;
-                    }*/
+                    }
 
                     case Regs::PixelFormat::RGB8:
                     {
-                        // TODO: Most likely got the component order messed up.
                         u8* dstptr = dest_pointer + (x + y * output_width) * 3;
-                        dstptr[0] = source_color.r; // blue
+                        dstptr[2] = source_color.r; // red
                         dstptr[1] = source_color.g; // green
-                        dstptr[2] = source_color.b; // red
+                        dstptr[0] = source_color.b; // blue
+                        break;
+                    }
+
+                    case Regs::PixelFormat::RGB5A1:
+                    {
+                        u16* dstptr = (u16*)(dest_pointer + x * 2 + y * config.output_width * 2);
+                        *dstptr = ((source_color.r >> 3) << 11) | ((source_color.g >> 3) << 6)
+                                | ((source_color.b >> 3) <<  1) | ( source_color.a >> 7);
+                        break;
+                    }
+
+                    case Regs::PixelFormat::RGBA4:
+                    {
+                        u16* dstptr = (u16*)(dest_pointer + x * 2 + y * config.output_width * 2);
+                        *dstptr = ((source_color.r >> 4) << 12) | ((source_color.g >> 4) << 8)
+                                | ((source_color.b >> 4) <<  4) | ( source_color.a >> 4);
                         break;
                     }
 
diff --git a/src/core/mem_map_funcs.cpp b/src/core/mem_map_funcs.cpp
index 4f93c0e64..48f61db4e 100644
--- a/src/core/mem_map_funcs.cpp
+++ b/src/core/mem_map_funcs.cpp
@@ -236,30 +236,12 @@ u8 Read8(const VAddr addr) {
 u16 Read16(const VAddr addr) {
     u16_le data = 0;
     Read<u16_le>(data, addr);
-
-    // Check for 16-bit unaligned memory reads...
-    if (addr & 1) {
-        // TODO(bunnei): Implement 16-bit unaligned memory reads
-        LOG_ERROR(HW_Memory, "16-bit unaligned memory reads are not implemented!");
-    }
-
     return (u16)data;
 }
 
 u32 Read32(const VAddr addr) {
     u32_le data = 0;
     Read<u32_le>(data, addr);
-
-    // Check for 32-bit unaligned memory reads...
-    if (addr & 3) {
-        // ARM allows for unaligned memory reads, however older ARM architectures read out memory
-        // from unaligned addresses in a shifted way. Our ARM CPU core (SkyEye) corrects for this,
-        // so therefore expects the memory to be read out in this manner.
-        // TODO(bunnei): Determine if this is necessary - perhaps it is OK to remove this from both
-        // SkyEye and here?
-        int shift = (addr & 3) * 8;
-        data = (data << shift) | (data >> (32 - shift));
-    }
     return (u32)data;
 }
 
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 81df09baf..17f8f70ca 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -20,7 +20,7 @@ namespace Rasterizer {
 
 static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
     const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
-    u32* color_buffer = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr)));
+    u8* color_buffer = Memory::GetPointer(PAddrToVAddr(addr));
 
     // Similarly to textures, the render framebuffer is laid out from bottom to top, too.
     // NOTE: The framebuffer height register contains the actual FB height minus one.
@@ -29,8 +29,11 @@ static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
     switch (registers.framebuffer.color_format) {
     case registers.framebuffer.RGBA8:
     {
-        u32 value = (color.a() << 24) | (color.r() << 16) | (color.g() << 8) | color.b();
-        *(color_buffer + x + y * registers.framebuffer.GetWidth()) = value;
+        u8* pixel = color_buffer + (x + y * registers.framebuffer.GetWidth()) * 4;
+        pixel[3] = color.r();
+        pixel[2] = color.g();
+        pixel[1] = color.b();
+        pixel[0] = color.a();
         break;
     }
 
@@ -42,17 +45,27 @@ static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
 
 static const Math::Vec4<u8> GetPixel(int x, int y) {
     const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
-    u32* color_buffer_u32 = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr)));
+    u8* framebuffer_bytes = Memory::GetPointer(PAddrToVAddr(addr));
 
     y = (registers.framebuffer.height - y);
 
-    u32 value = *(color_buffer_u32 + x + y * registers.framebuffer.GetWidth());
-    Math::Vec4<u8> ret;
-    ret.a() = value >> 24;
-    ret.r() = (value >> 16) & 0xFF;
-    ret.g() = (value >> 8) & 0xFF;
-    ret.b() = value & 0xFF;
-    return ret;
+    switch (registers.framebuffer.color_format) {
+    case registers.framebuffer.RGBA8: // four bytes per pixel, read here as A, B, G, R
+    {
+        const u8* src = framebuffer_bytes + (x + y * registers.framebuffer.GetWidth()) * 4;
+        Math::Vec4<u8> color;
+        color.a() = src[0];
+        color.b() = src[1];
+        color.g() = src[2];
+        color.r() = src[3];
+        return color;
+    }
+    default:
+        LOG_CRITICAL(Render_Software, "Unknown framebuffer color format %x", registers.framebuffer.color_format);
+        UNIMPLEMENTED();
+    }
+
+    return {}; // only reachable through the default case above
  }
 
 static u32 GetDepth(int x, int y) {
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
index def868ac7..bc8c0041c 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/vertex_shader.cpp
@@ -90,6 +90,7 @@ struct VertexShaderState {
         u8 repeat_counter;  // How often to repeat until this call stack element is removed
         u8 loop_increment;  // Which value to add to the loop counter after an iteration
                             // TODO: Should this be a signed value? Does it even matter?
+        u32 loop_address;   // The address where we'll return to after each loop iteration
     };
 
     // TODO: Is there a maximal size for this?
@@ -115,6 +116,8 @@ static void ProcessShaderCode(VertexShaderState& state) {
                 if (top.repeat_counter-- == 0) {
                     state.program_counter = &shader_memory[top.return_address];
                     state.call_stack.pop();
+                } else {
+                    state.program_counter = &shader_memory[top.loop_address];
                 }
 
                 // TODO: Is "trying again" accurate to hardware?
@@ -129,7 +132,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
         static auto call = [](VertexShaderState& state, u32 offset, u32 num_instructions,
                               u32 return_offset, u8 repeat_count, u8 loop_increment) {
             state.program_counter = &shader_memory[offset] - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
-            state.call_stack.push({ offset + num_instructions, return_offset, repeat_count, loop_increment });
+            state.call_stack.push({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset });
         };
         u32 binary_offset = state.program_counter - shader_memory.data();