4 files changed, 137 insertions, 68 deletions
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index e031871e8..c4cdf672b 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -27,6 +27,10 @@ static int float_regs_counter = 0;
 
 static u32 uniform_write_buffer[4];
 
+static int default_attr_counter = 0;
+
+static u32 default_attr_write_buffer[3];
+
 Common::Profiling::TimingCategory category_drawing("Drawing");
 
 static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
@@ -71,12 +75,9 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             u32 vertex_attribute_sources[16];
             boost::fill(vertex_attribute_sources, 0xdeadbeef);
             u32 vertex_attribute_strides[16];
-            u32 vertex_attribute_formats[16];
+            Regs::VertexAttributeFormat vertex_attribute_formats[16];
 
-            // HACK: Initialize vertex_attribute_elements to zero to prevent infinite loops below.
-            // This is one of the hacks required to deal with uninitalized vertex attributes.
-            // TODO: Fix this properly.
-            u32 vertex_attribute_elements[16] = {};
+            u32 vertex_attribute_elements[16] = {}; // Zeroed: attributes no loader fills must load 0 components
             u32 vertex_attribute_element_size[16];
 
             // Setup attribute data from loaders
@@ -90,7 +91,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                     u32 attribute_index = loader_config.GetComponent(component);
                     vertex_attribute_sources[attribute_index] = load_address;
                     vertex_attribute_strides[attribute_index] = static_cast<u32>(loader_config.byte_count);
-                    vertex_attribute_formats[attribute_index] = static_cast<u32>(attribute_config.GetFormat(attribute_index));
+                    vertex_attribute_formats[attribute_index] = attribute_config.GetFormat(attribute_index);
                     vertex_attribute_elements[attribute_index] = attribute_config.GetNumElements(attribute_index);
                     vertex_attribute_element_size[attribute_index] = attribute_config.GetElementSizeInBytes(attribute_index);
                     load_address += attribute_config.GetStride(attribute_index);
@@ -126,26 +127,29 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                 input.attr[0].w = debug_token;
 
                 for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) {
-                    for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
-                        const u8* srcdata = Memory::GetPointer(PAddrToVAddr(vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i]));
-
-                        // TODO(neobrain): Ocarina of Time 3D has GetNumTotalAttributes return 8,
-                        // yet only provides 2 valid source data addresses. Need to figure out
-                        // what's wrong there, until then we just continue when address lookup fails
-                        if (srcdata == nullptr)
-                            continue;
-
-                        const float srcval = (vertex_attribute_formats[i] == 0) ? *(s8*)srcdata :
-                                             (vertex_attribute_formats[i] == 1) ? *(u8*)srcdata :
-                                             (vertex_attribute_formats[i] == 2) ? *(s16*)srcdata :
-                                                                                  *(float*)srcdata;
-                        input.attr[i][comp] = float24::FromFloat32(srcval);
-                        LOG_TRACE(HW_GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08lx + 0x%04lx: %f",
-                                  comp, i, vertex, index,
-                                  attribute_config.GetPhysicalBaseAddress(),
-                                  vertex_attribute_sources[i] - base_address,
-                                  vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i],
-                                  input.attr[i][comp].ToFloat32());
+                    if (attribute_config.IsDefaultAttribute(i)) {
+                        input.attr[i] = VertexShader::GetDefaultAttribute(i);
+                        LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)",
+                                  i, vertex, index,
+                                  input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(),
+                                  input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32());
+                    } else {
+                        for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
+                            const u8* srcdata = Memory::GetPointer(PAddrToVAddr(vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i]));
+
+                            const float srcval = (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::BYTE) ? *(s8*)srcdata :
+                                (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::UBYTE) ? *(u8*)srcdata :
+                                (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? *(s16*)srcdata :
+                                *(float*)srcdata;
+
+                            input.attr[i][comp] = float24::FromFloat32(srcval);
+                            LOG_TRACE(HW_GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08lx + 0x%04lx: %f",
+                                      comp, i, vertex, index,
+                                      attribute_config.GetPhysicalBaseAddress(),
+                                      vertex_attribute_sources[i] - base_address,
+                                      vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i],
+                                      input.attr[i][comp].ToFloat32());
+                        }
                     }
                 }
 
@@ -224,7 +228,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             //       it directly write the values?
             uniform_write_buffer[float_regs_counter++] = value;
 
-            // Uniforms are written in a packed format such that 4 float24 values are encoded in
+            // Uniforms are written in a packed format such that four float24 values are encoded in
             // three 32-bit numbers. We write to internal memory once a full such vector is
             // written.
             if ((float_regs_counter >= 4 && uniform_setup.IsFloat32()) ||
@@ -259,6 +263,46 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             }
             break;
         }
+
+        // Load default vertex input attributes
+        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[0], 0x233):
+        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[1], 0x234):
+        case PICA_REG_INDEX_WORKAROUND(vs_default_attributes_setup.set_value[2], 0x235):
+        {
+            // TODO: Does actual hardware indeed keep an intermediate buffer or does
+            //       it directly write the values?
+            default_attr_write_buffer[default_attr_counter++] = value;
+
+            // Default attributes are written in a packed format: four float24 values are encoded in
+            // three 32-bit words. The attribute is only committed to internal memory once all
+            // three words have been written.
+            if (default_attr_counter >= 3) {
+                default_attr_counter = 0;
+
+                auto& setup = registers.vs_default_attributes_setup;
+
+                if (setup.index >= 16) {
+                    LOG_ERROR(HW_GPU, "Invalid VS default attribute index %d", (int)setup.index);
+                    break;
+                }
+
+                Math::Vec4<float24>& attribute = VertexShader::GetDefaultAttribute(setup.index);
+
+                // NOTE: The destination component order is indeed "backwards": word 0 starts with w, word 2 ends with x
+                attribute.w = float24::FromRawFloat24(default_attr_write_buffer[0] >> 8);
+                attribute.z = float24::FromRawFloat24(((default_attr_write_buffer[0] & 0xFF) << 16) | ((default_attr_write_buffer[1] >> 16) & 0xFFFF));
+                attribute.y = float24::FromRawFloat24(((default_attr_write_buffer[1] & 0xFFFF) << 8) | ((default_attr_write_buffer[2] >> 24) & 0xFF));
+                attribute.x = float24::FromRawFloat24(default_attr_write_buffer[2] & 0xFFFFFF);
+
+                LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index,
+                          attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(),
+                          attribute.w.ToFloat32());
+
+                // TODO: Verify that this actually modifies the register!
+                setup.index = setup.index + 1;
+            }
+            break;
+        }
 
         // Load shader program code
         case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc):
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index fe20cd77d..8acad8676 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -489,14 +489,14 @@ struct Regs {
 
     INSERT_PADDING_WORDS(0xe0);
 
-    struct {
-        enum class Format : u64 {
-            BYTE = 0,
-            UBYTE = 1,
-            SHORT = 2,
-            FLOAT = 3,
-        };
+    enum class VertexAttributeFormat : u64 {
+        BYTE = 0,
+        UBYTE = 1,
+        SHORT = 2,
+        FLOAT = 3,
+    };
 
+    struct {
         BitField<0, 29, u32> base_address;
 
         u32 GetPhysicalBaseAddress() const {
@@ -505,29 +505,29 @@ struct Regs {
 
         // Descriptor for internal vertex attributes
         union {
-            BitField< 0,  2, Format> format0; // size of one element
+            BitField< 0,  2, VertexAttributeFormat> format0; // size of one element
             BitField< 2,  2, u64> size0;      // number of elements minus 1
-            BitField< 4,  2, Format> format1;
+            BitField< 4,  2, VertexAttributeFormat> format1;
             BitField< 6,  2, u64> size1;
-            BitField< 8,  2, Format> format2;
+            BitField< 8,  2, VertexAttributeFormat> format2;
             BitField<10,  2, u64> size2;
-            BitField<12,  2, Format> format3;
+            BitField<12,  2, VertexAttributeFormat> format3;
             BitField<14,  2, u64> size3;
-            BitField<16,  2, Format> format4;
+            BitField<16,  2, VertexAttributeFormat> format4;
             BitField<18,  2, u64> size4;
-            BitField<20,  2, Format> format5;
+            BitField<20,  2, VertexAttributeFormat> format5;
             BitField<22,  2, u64> size5;
-            BitField<24,  2, Format> format6;
+            BitField<24,  2, VertexAttributeFormat> format6;
             BitField<26,  2, u64> size6;
-            BitField<28,  2, Format> format7;
+            BitField<28,  2, VertexAttributeFormat> format7;
             BitField<30,  2, u64> size7;
-            BitField<32,  2, Format> format8;
+            BitField<32,  2, VertexAttributeFormat> format8;
             BitField<34,  2, u64> size8;
-            BitField<36,  2, Format> format9;
+            BitField<36,  2, VertexAttributeFormat> format9;
             BitField<38,  2, u64> size9;
-            BitField<40,  2, Format> format10;
+            BitField<40,  2, VertexAttributeFormat> format10;
             BitField<42,  2, u64> size10;
-            BitField<44,  2, Format> format11;
+            BitField<44,  2, VertexAttributeFormat> format11;
             BitField<46,  2, u64> size11;
 
             BitField<48, 12, u64> attribute_mask;
@@ -536,8 +536,8 @@ struct Regs {
             BitField<60,  4, u64> num_extra_attributes;
         };
 
-        inline Format GetFormat(int n) const {
-            Format formats[] = {
+        inline VertexAttributeFormat GetFormat(int n) const {
+            VertexAttributeFormat formats[] = {
                 format0, format1, format2, format3,
                 format4, format5, format6, format7,
                 format8, format9, format10, format11
@@ -555,14 +555,18 @@ struct Regs {
         }
 
         inline int GetElementSizeInBytes(int n) const {
-            return (GetFormat(n) == Format::FLOAT) ? 4 :
-                (GetFormat(n) == Format::SHORT) ? 2 : 1;
+            return (GetFormat(n) == VertexAttributeFormat::FLOAT) ? 4 :
+                (GetFormat(n) == VertexAttributeFormat::SHORT) ? 2 : 1;
         }
 
         inline int GetStride(int n) const {
             return GetNumElements(n) * GetElementSizeInBytes(n);
         }
 
+        inline bool IsDefaultAttribute(int id) const {
+            return (id >= 12) || (attribute_mask & (1 << id)) != 0;
+        }
+
         inline int GetNumTotalAttributes() const {
             return (int)num_extra_attributes+1;
         }
@@ -625,7 +629,18 @@ struct Regs {
     u32 trigger_draw;
     u32 trigger_draw_indexed;
 
-    INSERT_PADDING_WORDS(0x2e);
+    INSERT_PADDING_WORDS(0x2);
+
+    // These registers are used to set up the default "fall-back" vertex shader attributes
+    struct {
+        // Index of the current default attribute
+        u32 index;
+
+        // Writing to these registers sets the "current" default attribute.
+        u32 set_value[3];
+    } vs_default_attributes_setup;
+
+    INSERT_PADDING_WORDS(0x28);
 
     enum class TriangleTopology : u32 {
         List        = 0,
@@ -669,7 +684,7 @@ struct Regs {
         BitField<56, 4, u64> attribute14_register;
         BitField<60, 4, u64> attribute15_register;
 
-        int GetRegisterForAttribute(int attribute_index) {
+        int GetRegisterForAttribute(int attribute_index) const {
             u64 fields[] = {
                 attribute0_register,  attribute1_register,  attribute2_register,  attribute3_register,
                 attribute4_register,  attribute5_register,  attribute6_register,  attribute7_register,
@@ -775,6 +790,7 @@ struct Regs {
         ADD_FIELD(num_vertices);
         ADD_FIELD(trigger_draw);
         ADD_FIELD(trigger_draw_indexed);
+        ADD_FIELD(vs_default_attributes_setup);
         ADD_FIELD(triangle_topology);
         ADD_FIELD(vs_bool_uniforms);
         ADD_FIELD(vs_int_uniforms);
@@ -849,6 +865,7 @@ ASSERT_REG_POSITION(index_array, 0x227);
 ASSERT_REG_POSITION(num_vertices, 0x228);
 ASSERT_REG_POSITION(trigger_draw, 0x22e);
 ASSERT_REG_POSITION(trigger_draw_indexed, 0x22f);
+ASSERT_REG_POSITION(vs_default_attributes_setup, 0x232);
 ASSERT_REG_POSITION(triangle_topology, 0x25e);
 ASSERT_REG_POSITION(vs_bool_uniforms, 0x2b0);
 ASSERT_REG_POSITION(vs_int_uniforms, 0x2b1);
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
index e8d865172..51f4e58bf 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/vertex_shader.cpp
@@ -35,6 +35,8 @@ static struct {
     std::array<Math::Vec4<u8>,4> i;
 } shader_uniforms;
 
+static Math::Vec4<float24> vs_default_attributes[16];
+
 // TODO: Not sure where the shader binary and swizzle patterns are supposed to be loaded to!
 // For now, we just keep these local arrays around.
 static std::array<u32, 1024> shader_memory;
@@ -60,6 +62,10 @@ Math::Vec4<u8>& GetIntUniform(u32 index) {
     return shader_uniforms.i[index];
 }
 
+Math::Vec4<float24>& GetDefaultAttribute(u32 index) {
+    return vs_default_attributes[index];
+}
+
 const std::array<u32, 1024>& GetShaderBinary() {
     return shader_memory;
 }
@@ -568,22 +574,23 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes) {
     const auto& attribute_register_map = registers.vs_input_register_map;
     float24 dummy_register;
     boost::fill(state.input_register_table, &dummy_register);
-    if(num_attributes > 0) state.input_register_table[attribute_register_map.attribute0_register] = &input.attr[0].x;
-    if(num_attributes > 1) state.input_register_table[attribute_register_map.attribute1_register] = &input.attr[1].x;
-    if(num_attributes > 2) state.input_register_table[attribute_register_map.attribute2_register] = &input.attr[2].x;
-    if(num_attributes > 3) state.input_register_table[attribute_register_map.attribute3_register] = &input.attr[3].x;
-    if(num_attributes > 4) state.input_register_table[attribute_register_map.attribute4_register] = &input.attr[4].x;
-    if(num_attributes > 5) state.input_register_table[attribute_register_map.attribute5_register] = &input.attr[5].x;
-    if(num_attributes > 6) state.input_register_table[attribute_register_map.attribute6_register] = &input.attr[6].x;
-    if(num_attributes > 7) state.input_register_table[attribute_register_map.attribute7_register] = &input.attr[7].x;
-    if(num_attributes > 8) state.input_register_table[attribute_register_map.attribute8_register] = &input.attr[8].x;
-    if(num_attributes > 9) state.input_register_table[attribute_register_map.attribute9_register] = &input.attr[9].x;
-    if(num_attributes > 10) state.input_register_table[attribute_register_map.attribute10_register] = &input.attr[10].x;
-    if(num_attributes > 11) state.input_register_table[attribute_register_map.attribute11_register] = &input.attr[11].x;
-    if(num_attributes > 12) state.input_register_table[attribute_register_map.attribute12_register] = &input.attr[12].x;
-    if(num_attributes > 13) state.input_register_table[attribute_register_map.attribute13_register] = &input.attr[13].x;
-    if(num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x;
-    if(num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x;
+
+    if (num_attributes > 0) state.input_register_table[attribute_register_map.attribute0_register] = &input.attr[0].x;
+    if (num_attributes > 1) state.input_register_table[attribute_register_map.attribute1_register] = &input.attr[1].x;
+    if (num_attributes > 2) state.input_register_table[attribute_register_map.attribute2_register] = &input.attr[2].x;
+    if (num_attributes > 3) state.input_register_table[attribute_register_map.attribute3_register] = &input.attr[3].x;
+    if (num_attributes > 4) state.input_register_table[attribute_register_map.attribute4_register] = &input.attr[4].x;
+    if (num_attributes > 5) state.input_register_table[attribute_register_map.attribute5_register] = &input.attr[5].x;
+    if (num_attributes > 6) state.input_register_table[attribute_register_map.attribute6_register] = &input.attr[6].x;
+    if (num_attributes > 7) state.input_register_table[attribute_register_map.attribute7_register] = &input.attr[7].x;
+    if (num_attributes > 8) state.input_register_table[attribute_register_map.attribute8_register] = &input.attr[8].x;
+    if (num_attributes > 9) state.input_register_table[attribute_register_map.attribute9_register] = &input.attr[9].x;
+    if (num_attributes > 10) state.input_register_table[attribute_register_map.attribute10_register] = &input.attr[10].x;
+    if (num_attributes > 11) state.input_register_table[attribute_register_map.attribute11_register] = &input.attr[11].x;
+    if (num_attributes > 12) state.input_register_table[attribute_register_map.attribute12_register] = &input.attr[12].x;
+    if (num_attributes > 13) state.input_register_table[attribute_register_map.attribute13_register] = &input.attr[13].x;
+    if (num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x;
+    if (num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x;
 
     state.conditional_code[0] = false;
     state.conditional_code[1] = false;
diff --git a/src/video_core/vertex_shader.h b/src/video_core/vertex_shader.h
index 3a68a3409..c26709bbc 100644
--- a/src/video_core/vertex_shader.h
+++ b/src/video_core/vertex_shader.h
@@ -74,6 +74,7 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes);
 Math::Vec4<float24>& GetFloatUniform(u32 index);
 bool& GetBoolUniform(u32 index);
 Math::Vec4<u8>& GetIntUniform(u32 index);
+Math::Vec4<float24>& GetDefaultAttribute(u32 index);
 
 const std::array<u32, 1024>& GetShaderBinary();
 const std::array<u32, 1024>& GetSwizzlePatterns();